From 23c4d46790c4de576555eee5fed73b3692280405 Mon Sep 17 00:00:00 2001 From: Andrew Stevens Date: Tue, 11 Aug 2020 11:44:22 +0200 Subject: [PATCH 01/14] Over squashed 100 IFX Commits (stripped down to essentials) --- .vscode/c_cpp_properties.json | 22 - .vscode/launch.json | 65 -- .vscode/tasks.json | 34 - CMakeLists.txt | 29 +- Makefile | 27 +- README.md | 10 +- common.mk | 22 +- src/BuiltinAllocations.cc | 136 +++ src/BuiltinAllocations.h | 20 + src/CodeWriter.cc | 278 +++---- src/CodeWriter.h | 61 +- src/Compiler.cc | 1051 +++++++++++++++++++----- src/Compiler.h | 111 +-- src/CustomOperators.cc | 19 +- src/CustomOperators.h | 2 +- src/Makefile.inc | 9 +- src/MemMap.cc | 196 ++++- src/MemMap.h | 123 ++- src/ModelInfo.h | 52 ++ src/Options.h | 23 + src/RecordAllocations.cc | 185 ++++- src/RecordAllocations.h | 40 +- src/TypeToString.cc | 2 +- src/main.cc | 75 +- tflite_u_preint/static_data_utils.cc | 99 +++ tflite_u_preint/static_data_utils.h | 41 + tflite_u_preint/static_init_support.cc | 430 ++++++++++ tflite_u_preint/static_init_support.h | 528 ++++++++++++ 28 files changed, 3042 insertions(+), 648 deletions(-) delete mode 100644 .vscode/c_cpp_properties.json delete mode 100644 .vscode/launch.json delete mode 100644 .vscode/tasks.json create mode 100644 src/BuiltinAllocations.cc create mode 100644 src/BuiltinAllocations.h create mode 100644 src/ModelInfo.h create mode 100644 src/Options.h create mode 100644 tflite_u_preint/static_data_utils.cc create mode 100644 tflite_u_preint/static_data_utils.h create mode 100644 tflite_u_preint/static_init_support.cc create mode 100644 tflite_u_preint/static_init_support.h diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json deleted file mode 100644 index 8b32d41..0000000 --- a/.vscode/c_cpp_properties.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "configurations": [ - { - "name": "Linux", - "includePath": [ - "${workspaceFolder}/**", - "${workspaceFolder}/../tensorflow", - "${workspaceFolder}/../tensorflow/tensorflow/lite/micro/tools/make/downloads/", - "${workspaceFolder}/../tensorflow/tensorflow/lite/micro/tools/make/downloads/gemmlowp", - "${workspaceFolder}/../tensorflow/tensorflow/lite/micro/tools/make/downloads/flatbuffers/include", - "${workspaceFolder}/../tensorflow/tensorflow/lite/micro/tools/make/downloads/ruy", - "${workspaceFolder}/../tensorflow/tensorflow/lite/micro/tools/make/downloads/kissfft" - ], - "defines": [ "TF_LITE_STATIC_MEMORY", "NDEBUG", "TF_LITE_DISABLE_X86_NEON", "SUFFICIENT_ARENA_SIZE" ], - "compilerPath": "/usr/bin/g++", - "cStandard": "c11", - "cppStandard": "c++17", - "intelliSenseMode": "clang-x64" - } - ], - "version": 4 -} diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index df88327..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. 
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "(gdb) hello example Launch", - "type": "cppdbg", - "request": "launch", - "program": "${workspaceFolder}/example/hello_world", - "args": [], - "stopAtEntry": true, - "cwd": "${workspaceFolder}", - "environment": [], - "externalConsole": false, - "MIMode": "gdb", - "setupCommands": [ - { - "description": "Enable pretty-printing for gdb", - "text": "-enable-pretty-printing", - "ignoreFailures": true - } - ] - }, - { - "name": "(gdb) cifar compiled example Launch", - "type": "cppdbg", - "request": "launch", - "program": "${workspaceFolder}/examples/cifar10_compiled", - "args": [], - "stopAtEntry": true, - "cwd": "${workspaceFolder}", - "environment": [], - "externalConsole": false, - "MIMode": "gdb", - "setupCommands": [ - { - "description": "Enable pretty-printing for gdb", - "text": "-enable-pretty-printing", - "ignoreFailures": true - } - ] - }, - { - "name": "(gdb) cifar interpreter example Launch", - "type": "cppdbg", - "request": "launch", - "program": "${workspaceFolder}/examples/cifar10", - "args": [], - "stopAtEntry": true, - "cwd": "${workspaceFolder}", - "environment": [], - "externalConsole": false, - "MIMode": "gdb", - "setupCommands": [ - { - "description": "Enable pretty-printing for gdb", - "text": "-enable-pretty-printing", - "ignoreFailures": true - } - ] - } - ] -} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json deleted file mode 100644 index 313b24c..0000000 --- a/.vscode/tasks.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - // See https://go.microsoft.com/fwlink/?LinkId=733558 - // for the documentation about the tasks.json format - "version": "2.0.0", - "tasks": [ - { - "label": "make", - "type": "shell", - "command": "make", - // use options.cwd property if the Makefile is not in the project root ${workspaceRoot} dir - "options": { - "cwd": "${workspaceRoot}" - }, - // start the build without prompting for task selection, use "group": "build" otherwise - "group": { - "kind": "build", - "isDefault": true - }, - "presentation": { - "echo": true, - "reveal": "always", - "focus": false, - "panel": "shared" - }, - // arg passing example: in this case is executed make QUIET=0 - "args": ["QUIET=0"], - // Use the standard less compilation problem matcher. 
- "problemMatcher": { - "base": "$gcc", - "fileLocation": [ "relative", "${workspaceRoot}" ] - } - } - ] -} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 21c8fb8..8b93a33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,18 +8,23 @@ IF(NOT TF_DIR) SET(TF_DIR "../tensorflow" CACHE STRING "TensorFlow source directory") ENDIF() -GET_FILENAME_COMPONENT(tf_fullpath ${TF_DIR} REALPATH) +GET_FILENAME_COMPONENT(TF_ABSPATH ${TF_DIR} REALPATH) IF(NOT GET_TF_SRC) - if(EXISTS "${tf_fullpath}") - SET(TFL_SRC ${TF_DIR}/tensorflow/lite) + if(EXISTS "${TF_ABSPATH}") + SET(TFL_SRC ${TF_ABSPATH}/tensorflow/lite) SET(TFLM_SRC ${TFL_SRC}/micro) SET(TFLMD_SRC ${TFLM_SRC}/tools/make/downloads) SET(TF_INCS - ${TF_DIR} - ${TFLMD_SRC}/flatbuffers/include - ${TFLMD_SRC}/ruy + ${TF_ABSPATH} + ${TFLMD_SRC}/flatbuffers/include + ${TFLMD_SRC}/ruy ) + IF(WIN32) + SET(TF_LIB ${TFLM_SRC}/tools/make/gen/windows_x86_64/lib/libtensorflow-microlite.a) + ELSE() + SET(TF_LIB ${TFLM_SRC}/tools/make/gen/linux_x86_64/lib/libtensorflow-microlite.a) + ENDIF() ELSE() MESSAGE(FATAL_ERROR "\ No valid TensorFlow source directory provided, default path \ @@ -32,10 +37,9 @@ ELSE() SET(TF_INCS ${TFLite_INCLUDE_DIRS} ) + SET(TF_LIB tensorflow-microlite) ENDIF() -SET(TF_LIB tensorflow-microlite) - SET(COMPILER_HEADERS ${PROJECT_SOURCE_DIR}/src/CodeWriter.h ${PROJECT_SOURCE_DIR}/src/Compiler.h @@ -43,6 +47,8 @@ SET(COMPILER_HEADERS ${PROJECT_SOURCE_DIR}/src/MemMap.h ${PROJECT_SOURCE_DIR}/src/RecordAllocations.h ${PROJECT_SOURCE_DIR}/src/TypeToString.h + ${PROJECT_SOURCE_DIR}/src/BuiltinAllocations.h + ${PROJECT_SOURCE_DIR}/src/ModelInfo.h ) SET(COMPILER_SRCS @@ -52,6 +58,7 @@ SET(COMPILER_SRCS ${PROJECT_SOURCE_DIR}/src/MemMap.cc ${PROJECT_SOURCE_DIR}/src/RecordAllocations.cc ${PROJECT_SOURCE_DIR}/src/TypeToString.cc + ${PROJECT_SOURCE_DIR}/src/BuiltinAllocations.cc ${PROJECT_SOURCE_DIR}/src/main.cc ) @@ -63,12 +70,6 @@ TARGET_INCLUDE_DIRECTORIES(${PROJECT_NAME} PUBLIC ${TF_INCS} ) -IF(WIN32) - TARGET_LINK_DIRECTORIES(${PROJECT_NAME} PUBLIC ${TFLM_SRC}/tools/make/gen/windows_x86_64/lib) -ELSE() - TARGET_LINK_DIRECTORIES(${PROJECT_NAME} PUBLIC ${TFLM_SRC}/tools/make/gen/linux_x86_64/lib) -ENDIF() - TARGET_LINK_LIBRARIES(${PROJECT_NAME} PUBLIC ${TF_LIB}) TARGET_COMPILE_DEFINITIONS(${PROJECT_NAME} PUBLIC diff --git a/Makefile b/Makefile index fcb7d5b..721cb02 100644 --- a/Makefile +++ b/Makefile @@ -1,20 +1,22 @@ TF_DIR=../tensorflow include common.mk -.PHONY: tflite all +.PHONY: tflite all -all: compiler examples +all: compiler$(EXE_SUFFIX) examples + +$(TF_MICROLITE_LIB): tflite tflite: $(MAKE) -C $(TF_DIR) -f tensorflow/lite/micro/tools/make/Makefile microlite -COMPILER_OBJS = src/main.o src/Compiler.o src/CodeWriter.o src/TypeToString.o src/RecordAllocations.o src/MemMap.o src/CustomOperators.o +COMPILER_OBJS = src/main.o src/Compiler.o src/CodeWriter.o src/TypeToString.o src/RecordAllocations.o src/MemMap.o src/CustomOperators.o src/BuiltinAllocations.o -compiler: $(COMPILER_OBJS) tflite - $(CXX) $(LDOPTS) -o $@ $(COMPILER_OBJS) $(LIBS) +compiler$(EXE_SUFFIX): $(COMPILER_OBJS) $(TF_MICROLITE_LIB) + $(CXX) $(CXXFLAGS) $(LDOPTS) -o $@ $(COMPILER_OBJS) $(LIBS) clean: clean-compiler clean-examples - $(MAKE) -C $(TF_DIR) -f tensorflow/lite/micro/tools/make/makefile clean + $(MAKE) -C $(TF_DIR) -f tensorflow/lite/micro/tools/make/Makefile clean FORMAT_FILES := $(shell find src -regex '.*\(h\|cpp\)') @@ -22,12 +24,19 @@ format: clang-format -i $(FORMAT_FILES) .PHONY: examples clean-examples clean-compiler 
-examples: - cd examples && $(MAKE) + +examples: tflite + $(MAKE) -C examples all + +run_examples: tflite + $(MAKE) -C examples run_all + +regenerate: compiler$(EXE_SUFFIX) + $(MAKE) -C examples regenerate clean-examples: $(MAKE) -C examples clean clean-compiler: - $(RM) src/*.o compiler + $(RM) src/*.o compiler$(EXE_SUFFIX) diff --git a/README.md b/README.md index 5908636..74d3aee 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ CMake using the option `TF_TAG`. e.g. ``` bash -cmake -DGET_TF_SRC=ON TF_TAG=v2.2.0 .. +cmake -DGET_TF_SRC=ON -DTF_TAG=v2.2.0 .. ``` Similarly a Git commit hash can be provided using `TF_COMMIT`. Note that @@ -57,7 +57,7 @@ Similarly a Git commit hash can be provided using `TF_COMMIT`. Note that e.g. ```bash -cmake -DGET_TF_SRC=ON TF_COMMIT=0fecf6f89fd7bacc1ec4213b946a254e885b82ac .. +cmake -DGET_TF_SRC=ON -DTF_COMMIT=0fecf6f89fd7bacc1ec4213b946a254e885b82ac .. ``` To checkout a different TensorFlow code base without clearing the CMake cache @@ -67,7 +67,7 @@ source to be checked-out again. e.g. ```bash -cmake -DGET_TF_SRC=ON -DTF_RECACHE=ON TF_COMMIT=0fecf6f89fd7bacc1ec4213b946a254e885b82ac .. +cmake -DGET_TF_SRC=ON -DTF_RECACHE=ON -DTF_COMMIT=0fecf6f89fd7bacc1ec4213b946a254e885b82ac .. ``` ## Providing TensorFlow Source Manually @@ -79,7 +79,7 @@ providing the argument `TF_DIR`. e.g. ``` bash -cmake -DTF_DIR=../../my_tf_source .. +cmake -DTF_DIR=../my_tensorflow .. ``` ## Additional Targets @@ -117,7 +117,7 @@ make format ./compiler hello_world.tflite hello_compiled.cpp hello_ ``` -- for a quick view into the generated code see [`compiled_hello.cpp`](https://github.com/cpetig/tflite_micro_compiler/blob/master/examples/compiled_hello.cpp) +- for a quick view into the generated code see [`compiled_hello_world.cc`](https://github.com/cpetig/tflite_micro_compiler/blob/master/examples/compiled_hello_world.cc) You can compare calling into interpreter and compiled code between [`hello_world.cc`](https://github.com/cpetig/tflite_micro_compiler/blob/master/examples/hello_world.cc) and [`hello_world2.cc`](https://github.com/cpetig/tflite_micro_compiler/blob/master/examples/hello_world2.cc) diff --git a/common.mk b/common.mk index fe89325..1ff3e61 100644 --- a/common.mk +++ b/common.mk @@ -1,17 +1,23 @@ -CXXFLAGS=-g -std=c++14 -DTF_LITE_STATIC_MEMORY -DNDEBUG -O3 -DTF_LITE_DISABLE_X86_NEON -DSUFFICIENT_ARENA_SIZE=128\*1024\*1024 \ +CXXFLAGS=-g -std=c++14 -DTF_LITE_STATIC_MEMORY -DDEBUG -O1 -DTF_LITE_DISABLE_X86_NEON -DSUFFICIENT_ARENA_SIZE=128\*1024\*1024 \ -I$(TF_DIR) -I$(TF_DIR)/tensorflow/lite/micro/tools/make/downloads/ \ -I$(TF_DIR)/tensorflow/lite/micro/tools/make/downloads/gemmlowp \ -I$(TF_DIR)/tensorflow/lite/micro/tools/make/downloads/flatbuffers/include \ -I$(TF_DIR)/tensorflow/lite/micro/tools/make/downloads/ruy \ - -I$(TF_DIR)/tensorflow/lite/micro/tools/make/downloads/kissfft - -LDOPTS=-L $(TF_DIR)/tensorflow/lite/micro/tools/make/gen/$(HOST_OS_BUILD)/lib + -I$(TF_DIR)/tensorflow/lite/micro/tools/make/downloads/kissfft +ifeq ($(BUILD_TYPE),debug) + HOST_OS_BUILD:=$(HOST_OS_BUILD)_debug +endif +TF_MICROLITE_LIBDIR=$(TF_DIR)/tensorflow/lite/micro/tools/make/gen/$(HOST_OS_BUILD)/lib +TF_MICROLITE_LIB=$(TF_MICROLITE_LIBDIR)/libtensorflow-microlite.a ifeq ($(OS),Windows_NT) - LIBS=-ltensorflow-microlite - HOST_OS_BUILD=windows_x86_64 + LIBS=$(TF_MICROLITE_LIB) + HOST_OS_BUILD := windows_x86_64 + EXE_SUFFIX := .exe else - LIBS=-ltensorflow-microlite -ldl - HOST_OS_BUILD=linux_x86_64 + LIBS=$(TF_MICROLITE_LIB) -ldl + HOST_OS_BUILD := linux_x86_64 + 
EXE_SUFFIX := endif + diff --git a/src/BuiltinAllocations.cc b/src/BuiltinAllocations.cc new file mode 100644 index 0000000..4f9d03d --- /dev/null +++ b/src/BuiltinAllocations.cc @@ -0,0 +1,136 @@ +#include "BuiltinAllocations.h" + +#include +#include + +#include "TypeToString.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/core/api/error_reporter.h" + +namespace { + +class AllocatorToGetLastAllocSize : public tflite::BuiltinDataAllocator { + public: + + void* Allocate(size_t size, size_t alignment_hint) override { + lastAllocSize = size; + allocated_blocks.push_back(std::make_unique(size)); + return reinterpret_cast(allocated_blocks.back().get()); + } + + void Deallocate(void* data) override { + } + + + size_t GetLastAllocSize() { return lastAllocSize; } + + private: + std::vector> allocated_blocks; + size_t lastAllocSize = 0; +}; + +} // namespace + +namespace tflmc { +namespace BuiltinAllocations { + +size_t GetBuiltinDataSize(tflite::BuiltinOperator opType, + const tflite::SubGraph* subgraph, + tflite::ErrorReporter &errReporter) { + // There seems to be no simple query function for this, so tickle the + // information out of the parse function. + auto dummyOp = subgraph->operators()->Get(0); + AllocatorToGetLastAllocSize allocator; + void* outData = nullptr; + if (tflite::ParseOpData(dummyOp, opType, &errReporter, &allocator, + &outData) != kTfLiteOk) { + throw std::runtime_error("ERROR: Unable to use tflite::ParseOpData to extract the BuiltinDataSize!\n" + "tensorflow/lite/core/api/flatbuffer_conversions.cc needs a patch to support this feature..."); + } + + return allocator.GetLastAllocSize(); +} + +std::pair getBuiltinStrings(tflite::BuiltinOperator op, + const void* data) { + using namespace tflmc; + std::stringstream builtinOptionsName, builtinOptionsStruct; + switch (op) { + case tflite::BuiltinOperator_CONV_2D: { + builtinOptionsName << "TfLiteConvParams"; + TfLiteConvParams const* p = (TfLiteConvParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->padding) << ", " << p->stride_width << "," + << p->stride_height << ", " << to_string(p->activation) << ", " + << p->dilation_width_factor << "," << p->dilation_height_factor + << " }"; + } break; + case tflite::BuiltinOperator_DEPTHWISE_CONV_2D: { + builtinOptionsName << "TfLiteDepthwiseConvParams"; + TfLiteDepthwiseConvParams const* p = + (TfLiteDepthwiseConvParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->padding) << ", " << p->stride_width << "," + << p->stride_height << ", " << p->depth_multiplier << ", " + << to_string(p->activation) << ", " << p->dilation_width_factor + << "," << p->dilation_height_factor << " }"; + } break; + case tflite::BuiltinOperator_FULLY_CONNECTED: { + builtinOptionsName << "TfLiteFullyConnectedParams"; + TfLiteFullyConnectedParams const* p = + (TfLiteFullyConnectedParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->activation) << ", " << to_string(p->weights_format) + << ", " << p->keep_num_dims << ", " << p->asymmetric_quantize_inputs + << " }"; + } break; + case tflite::BuiltinOperator_MAX_POOL_2D: + case tflite::BuiltinOperator_AVERAGE_POOL_2D: { + builtinOptionsName << "TfLitePoolParams"; + TfLitePoolParams const* p = (TfLitePoolParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->padding) << ", " << p->stride_width << "," + << p->stride_height << ", " << p->filter_width << "," + << p->filter_height << ", " << to_string(p->activation) << ", { " + << to_string(p->computed.padding) << " } }"; + } break; + case 
tflite::BuiltinOperator_RESHAPE: { + builtinOptionsName << "TfLiteReshapeParams"; + builtinOptionsStruct << "{ {"; + TfLiteReshapeParams const* p = (TfLiteReshapeParams const*)data; + for (uint32_t i = 0; i < TFLITE_RESHAPE_PARAMS_MAX_DIMENSION_COUNT; ++i) + builtinOptionsStruct << p->shape[i] << ", "; + builtinOptionsStruct << "}, " << p->num_dimensions << " }"; + } break; + case tflite::BuiltinOperator_SOFTMAX: { + builtinOptionsName << "TfLiteSoftmaxParams"; + TfLiteSoftmaxParams const* p = (TfLiteSoftmaxParams const*)data; + builtinOptionsStruct << "{ " << p->beta << " }"; + } break; + case tflite::BuiltinOperator_ADD: { + builtinOptionsName << "TfLiteAddParams"; + TfLiteAddParams const* p = (TfLiteAddParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->activation) << ", " + << p->pot_scale_int16 << " }"; + } break; + case tflite::BuiltinOperator_MUL: { + builtinOptionsName << "TfLiteMulParams"; + TfLiteMulParams const* p = (TfLiteMulParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->activation) << " }"; + } break; + case tflite::BuiltinOperator_SUB: { + builtinOptionsName << "TfLiteSubParams"; + TfLiteSubParams const* p = (TfLiteSubParams const*)data; + builtinOptionsStruct << "{ " << to_string(p->activation) << ", " + << p->pot_scale_int16 << " }"; + } break; + case tflite::BuiltinOperator_CONCATENATION: { + builtinOptionsName << "TfLiteConcatenationParams"; + TfLiteConcatenationParams const* p = + (TfLiteConcatenationParams const*)data; + builtinOptionsStruct << "{ " << p->axis << ", " << to_string(p->activation) << " }"; + } break; + default: { + } break; + } + return std::make_pair(builtinOptionsName.str(), builtinOptionsStruct.str()); +} + +} // namespace BuiltinAllocations +} // namespace tflmc diff --git a/src/BuiltinAllocations.h b/src/BuiltinAllocations.h new file mode 100644 index 0000000..73bcb62 --- /dev/null +++ b/src/BuiltinAllocations.h @@ -0,0 +1,20 @@ +#ifndef TFLMCOMPILER_BUILTIN_ALLOCATIONS_H +#define TFLMCOMPILER_BUILTIN_ALLOCATIONS_H + +#include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/core/api/error_reporter.h" + +namespace tflmc { +namespace BuiltinAllocations { + +size_t GetBuiltinDataSize(tflite::BuiltinOperator opType, + const tflite::SubGraph* subgraph, + tflite::ErrorReporter &errReporter); + +std::pair getBuiltinStrings(tflite::BuiltinOperator op, + const void* data); + +} // namespace BuiltinAllocations +} // namespace tflmc + +#endif diff --git a/src/CodeWriter.cc b/src/CodeWriter.cc index 564c0b1..39c4ff3 100644 --- a/src/CodeWriter.cc +++ b/src/CodeWriter.cc @@ -3,43 +3,20 @@ #include #include -#include "TypeToString.h" +#include "BuiltinAllocations.h" #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/micro/micro_error_reporter.h" - -namespace { - -class AllocatorToGetLastAllocSize : public tflite::BuiltinDataAllocator { - public: - void* Allocate(size_t size, size_t alignment_hint) override { - lastAllocSize = size; - return malloc(size); - } - void Deallocate(void* data) override { free(data); } - size_t GetLastAllocSize() { return lastAllocSize; } - - private: - size_t lastAllocSize = 0; -}; -size_t GetBuiltinDataSize(tflite::BuiltinOperator opType, - const tflite::SubGraph* subgraph) { - // There seems to be no simple query function for this, so tickle the - // information out of the parse function. 
- auto dummyOp = subgraph->operators()->Get(0); - tflite::MicroErrorReporter errReporter; - AllocatorToGetLastAllocSize allocator; - void* outData = nullptr; - if (tflite::ParseOpData(dummyOp, opType, &errReporter, &allocator, - &outData) == kTfLiteOk) - free(outData); - return allocator.GetLastAllocSize(); -} - -} // namespace +#include "tensorflow/lite/core/api/error_reporter.h" tflmc::CodeWriter::CodeWriter(std::ostream& out, - const tflite::SubGraph* subgraph) - : out_(out), subgraph_(subgraph) { + const tflite::SubGraph* subgraph, + tflite::ErrorReporter &err_reporter + ) + : out_(out), subgraph_(subgraph) + , err_reporter_(err_reporter) + , init_data_usage_(0) + , uninit_data_usage_(0) + , const_data_usage_(0) +{ // Setup stream: Print booleans as string: out_ << std::boolalpha; // Print floats with precision that is sufficient for exact back-conversion: @@ -62,97 +39,58 @@ void tflmc::CodeWriter::writeBuiltin(tflite::BuiltinOperator op, return; } out_ << "const "; - switch (op) { - case tflite::BuiltinOperator_CONV_2D: { - out_ << "TfLiteConvParams " << name << " = { "; - TfLiteConvParams const* p = (TfLiteConvParams const*)data; - out_ << to_string(p->padding) << ", " << p->stride_width << "," - << p->stride_height << ", " << to_string(p->activation) << ", " - << p->dilation_width_factor << "," << p->dilation_height_factor - << " };"; - } break; - case tflite::BuiltinOperator_DEPTHWISE_CONV_2D: { - out_ << "TfLiteDepthwiseConvParams " << name << " = { "; - TfLiteDepthwiseConvParams const* p = - (TfLiteDepthwiseConvParams const*)data; - out_ << to_string(p->padding) << ", " << p->stride_width << "," - << p->stride_height << ", " << p->depth_multiplier << ", " - << to_string(p->activation) << ", " << p->dilation_width_factor - << "," << p->dilation_height_factor << " };"; - } break; - case tflite::BuiltinOperator_FULLY_CONNECTED: { - out_ << "TfLiteFullyConnectedParams " << name << " = { "; - TfLiteFullyConnectedParams const* p = - (TfLiteFullyConnectedParams const*)data; - out_ << to_string(p->activation) << ", " << to_string(p->weights_format) - << ", " << p->keep_num_dims << ", " << p->asymmetric_quantize_inputs - << " };"; - } break; - case tflite::BuiltinOperator_MAX_POOL_2D: - case tflite::BuiltinOperator_AVERAGE_POOL_2D: { - out_ << "TfLitePoolParams " << name << " = { "; - TfLitePoolParams const* p = (TfLitePoolParams const*)data; - out_ << to_string(p->padding) << ", " << p->stride_width << "," - << p->stride_height << ", " << p->filter_width << "," - << p->filter_height << ", " << to_string(p->activation) << ", { " - << to_string(p->computed.padding) << " } };"; - } break; - case tflite::BuiltinOperator_RESHAPE: { - out_ << "TfLiteReshapeParams " << name << " = { { "; - TfLiteReshapeParams const* p = (TfLiteReshapeParams const*)data; - for (uint32_t i = 0; i < TFLITE_RESHAPE_PARAMS_MAX_DIMENSION_COUNT; ++i) - out_ << p->shape[i] << ", "; - out_ << "}, " << p->num_dimensions << " };"; - } break; - case tflite::BuiltinOperator_SOFTMAX: { - out_ << "TfLiteSoftmaxParams " << name << " = { "; - TfLiteSoftmaxParams const* p = (TfLiteSoftmaxParams const*)data; - out_ << p->beta << " };"; - } break; - case tflite::BuiltinOperator_ADD: { - out_ << "TfLiteAddParams " << name << " = { "; - TfLiteAddParams const* p = (TfLiteAddParams const*)data; - out_ << to_string(p->activation) << " };"; - } break; - case tflite::BuiltinOperator_MUL: { - out_ << "TfLiteMulParams " << name << " = { "; - TfLiteMulParams const* p = (TfLiteMulParams const*)data; - out_ << to_string(p->activation) << " 
};"; - } break; - case tflite::BuiltinOperator_SUB: { - out_ << "TfLiteSubParams " << name << " = { "; - TfLiteSubParams const* p = (TfLiteSubParams const*)data; - out_ << to_string(p->activation) << " };"; - } break; - case tflite::BuiltinOperator_CONCATENATION: { - out_ << "TfLiteConcatenationParams " << name << " = { "; - TfLiteConcatenationParams const* p = - (TfLiteConcatenationParams const*)data; - out_ << p->axis << ", " << to_string(p->activation) << " };"; - } break; - default: { - size_t datalen = GetBuiltinDataSize(op, subgraph_); - uint32_t alignment = datalen >= 4 ? 4 : datalen >= 2 ? 2 : 1; - out_ << "ALIGN(" << alignment << ") uint8_t " << name << "[" << datalen - << "] = { "; - for (uint32_t i = 0; i < datalen; ++i) - out_ << int(((uint8_t const*)data)[i]) << ", "; - out_ << " }; /* op type " << int(op) << "=" - << tflite::EnumNameBuiltinOperator(op) << " */"; - } break; + auto builtin_strings = BuiltinAllocations::getBuiltinStrings(op, data); + if (!builtin_strings.first.empty() && !builtin_strings.first.empty()) { + out_ << builtin_strings.first << " " << name << " = " + << builtin_strings.second << ";"; + } else { + size_t datalen = BuiltinAllocations::GetBuiltinDataSize(op, subgraph_, err_reporter_); + uint32_t alignment = datalen >= 4 ? 4 : datalen >= 2 ? 2 : 1; + out_ << "ALIGN(" << alignment << ") uint8_t " << name << "[" << datalen + << "] = { "; + for (uint32_t i = 0; i < datalen; ++i) + out_ << int(((uint8_t const*)data)[i]) << ", "; + out_ << " }; /* op type " << int(op) << "=" + << tflite::EnumNameBuiltinOperator(op) << " */"; } out_ << '\n'; } + + +void tflmc::CodeWriter::writeCustom(uint8_t const *opdata, size_t node_i, size_t opdata_size) { + out_ << "uint8_t ALIGN(4) opdata" + std::to_string(node_i) << "[" + << opdata_size << "] = { "; + for (size_t j = 0; j < opdata_size; ++j) + out_ << int(opdata[j]) << ", "; + out_ << " }; /* custom_initial_data */\n"; + const_data_usage_ += opdata_size; + init_data_usage_ += opdata_size; +} + +template +size_t writeTfArray( std::ostream &os, const TFArray *tfarray, const std::string &name, const char * suffix, const char *data_type_id) +{ + os << "const TfArray<" + << tfarray->size << ", " + << data_type_id << "> " + << name << suffix + << " = { " << tfarray->size << ", { "; + for (int i = 0; i < tfarray->size; i++) { + os << tfarray->data[i] << ", "; + } + os << "} };\n"; + return tfarray->size+1; +} + void tflmc::CodeWriter::writeIntArray(const TfLiteIntArray& arr, const std::string& name) { if (arr.size == 0) { out_ << "const int " << name << " = 0; /* empty TfLiteIntArray */\n"; + const_data_usage_ += sizeof(int); } else { - out_ << "const TfArray<" << arr.size << ", int> " << name << " = { " - << arr.size << ", { "; - writeIntArrayData(arr); - out_ << " } };\n"; + auto arr_size = writeTfArray(out_, &arr, name, "", "int"); + const_data_usage_ += sizeof(int)*arr_size; } } @@ -168,14 +106,16 @@ void tflmc::CodeWriter::writeIntArrayData(const TfLiteIntArray& arr) { // outputting int8_t as a character is not what we intend here, we want to see // the value, so we introduce printT template -static void dump_tensor_contents(std::ostream& out_, const TfLiteTensor& t, +static size_t dump_tensor_contents(std::ostream& out_, const TfLiteTensor& t, const std::string& tname, const std::string& name) { + + size_t mem_size; if (t.dims->size == 0) { // special case 0 dimensions, we output an array to // avoid distinction from >0 dimension at every use out_ << "const " << tname << " " << name << "[1] = { " << 
(printT)(tflite::GetTensorData(&t)[0]) << " };\n"; - return; + return sizeof(T); } uint32_t alignment = t.bytes >= 8 ? 8 : t.bytes >= 4 ? 4 : 2; @@ -194,6 +134,7 @@ static void dump_tensor_contents(std::ostream& out_, const TfLiteTensor& t, if (serialized_elts != nominal_elts) { out_ << serialized_elts << " /* PACKED "; + } out_ << t.dims->data[0]; @@ -202,13 +143,15 @@ static void dump_tensor_contents(std::ostream& out_, const TfLiteTensor& t, out_ << " */"; } out_ << "] = { "; - if (t.dims->size == 1 || serialized_elts != nominal_elts) { - // one dimension/packed: 10 per line of data - for (int i = 0; i < serialized_elts; ++i) { - if (i % 10 == 0) out_ << "\n "; - out_ << (printT)(tflite::GetTensorData(&t)[i]) << ", "; + if (t.dims->size == 1 || serialized_elts != nominal_elts) // one dimension/packed: 10 per line of data + { + for (size_t i = 0; i < serialized_elts; ++i) { + if (i%10 == 0) + out_ << "\n "; + out_ << (printT)(tflite::GetTensorData(&t)[i]) << ", "; } out_ << "\n};\n"; + mem_size = serialized_elts*sizeof(T); } else if (t.dims->size == 2) { // two dimensions: Inner dimension is one line for (int i = 0; i < t.dims->data[0]; ++i) { @@ -218,6 +161,7 @@ static void dump_tensor_contents(std::ostream& out_, const TfLiteTensor& t, << ", "; } out_ << "\n};\n"; + mem_size = nominal_elts*sizeof(T); } else { // More dimensions: Inner two dimensions per line (space between two // middle elements) @@ -249,12 +193,14 @@ static void dump_tensor_contents(std::ostream& out_, const TfLiteTensor& t, } } out_ << "\n};\n"; + mem_size = nominal_elts*sizeof(T); } + return mem_size; } #define DUMP_TENSOR2(TfType, CType, PrintType) \ case TfType: \ - dump_tensor_contents(out_, t, #CType, name); \ + const_data_usage_ += dump_tensor_contents(out_, t, #CType, name); \ break void tflmc::CodeWriter::writeTensor(const TfLiteTensor& t, @@ -276,42 +222,72 @@ void tflmc::CodeWriter::writeTensor(const TfLiteTensor& t, for (size_t i = 0; i < t.bytes; i++) out_ << int((uint8_t)t.data.raw_const[i]) << ","; out_ << " };\n"; + const_data_usage_ += t.bytes; } break; } } + +static void writeAffineQuantizationFields(std::ostream &out, const std::string& name, TfLiteAffineQuantization const *aq) { + + out << "{ " + << "(TfLiteFloatArray*)&" << name << "_scale, " + << "(TfLiteIntArray*)&" << name << "_zero, " << aq->quantized_dimension + << " }"; +} + + +#if SUPPORT_CUSTOM_QUANT +static void writeQuantizationDetails( + std::ostream& out, const TfLiteCustomSub8BitPackingDetails* sub8_details, + const std::string& name) { + out << "const TfLiteCustomSub8BitPackingDetails " << name << " = { "; + out << static_cast(sub8_details->bits_per_item) << ", "; + out << static_cast(sub8_details->container_bits) << ", "; + out << static_cast(sub8_details->packed_minor_dims) << ", "; + out << static_cast(sub8_details->sparsity_coding) << ", "; + out << "{}"; + out << "};\n"; +} +#endif // SUPPORT_CUSTOM_QUANT + void tflmc::CodeWriter::writeQuantization(const TfLiteQuantization& q, const std::string& name) { if (q.type == kTfLiteAffineQuantization) { auto aq = (TfLiteAffineQuantization const*)q.params; - out_ << "const TfArray<" << aq->scale->size << ", float> " << name - << "_scale = { " << aq->scale->size << ", { "; - for (int i = 0; i < aq->scale->size; i++) { - out_ << aq->scale->data[i] << ", "; - } - out_ << "} };\n"; - out_ << "const TfArray<" << aq->zero_point->size << ", int> " << name - << "_zero = { " << aq->zero_point->size << ", { "; - writeIntArrayData(*aq->zero_point); - out_ << " } };\n"; - out_ << "const 
TfLiteAffineQuantization " << name << " = { " - << "(TfLiteFloatArray*)&" << name << "_scale, " - << "(TfLiteIntArray*)&" << name << "_zero, " << aq->quantized_dimension - << " };\n"; + auto scale_size = writeTfArray(out_, aq->scale, name, "_scale", "float"); + auto zp_size = writeTfArray(out_, aq->zero_point, name, "_zero", "int"); + const_data_usage_ += scale_size * sizeof(float) + zp_size*sizeof(int); + out_ << "const TfLiteAffineQuantization " << name << " = "; + writeAffineQuantizationFields(out_, name, aq); + out_ << ";\n"; + const_data_usage_ += sizeof(TfLiteAffineQuantization); +#if SUPPORT_CUSTOM_QUANT + } else if (q.type == kTfLitePackedAffineQuantization) { + auto paq = (TfLitePackedAffineQuantization const*)q.params; + writeQuantizationDetails(out_, paq->custom_sub8bit_packing, name + "_packing"); + const_data_usage_ += sizeof(TfLiteCustomSub8BitPackingDetails); + auto aq = &paq->affine; + auto scale_size = writeTfArray(out_, aq->scale, name, "_scale", "float"); + auto zp_size = writeTfArray(out_, aq->zero_point, name, "_zero", "int"); + const_data_usage_ += scale_size * sizeof(float) + zp_size*sizeof(int); + out_ << "const TfLitePackedAffineQuantization " << name << " = { "; + writeAffineQuantizationFields(out_, name, aq); + out_ << ", &" << name + "_packing" << "};\n"; + const_data_usage_ += sizeof(kTfLitePackedAffineQuantization); +#endif // SUPPORT_CUSTOM_QUANT } } -#if TF_LITE_PACKED_QUANTIZED_DATA_VERSION == 100 -void tflmc::CodeWriter::writeQuantizationDetails(const TfLiteQuantization& q, - const std::string& name) { - if (q.details.type == kTfLiteSub8BitPackedUniformDetail) { - out_ << "const TfLiteCustomSub8BitPackingDetails " << name << " = { "; - auto sub8_details = q.details.data.custom_sub8bit_packing; - out_ << static_cast(sub8_details->bits_per_item) << ", "; - out_ << static_cast(sub8_details->container_bits) << ", "; - out_ << static_cast(sub8_details->packed_minor_dims) << ", "; - out_ << "{}"; - out_ << "};\n"; - } +void tflmc::CodeWriter::writeTensorArena(size_t tensor_arena_size) +{ + out_ << R"( +constexpr int kTensorArenaSize = )" + << tensor_arena_size << R"(; +uint8_t tensor_arena[kTensorArenaSize] ALIGN(16); +)"; + uninit_data_usage_ += tensor_arena_size; } -#endif + + + \ No newline at end of file diff --git a/src/CodeWriter.h b/src/CodeWriter.h index e8c69b7..f08a22d 100644 --- a/src/CodeWriter.h +++ b/src/CodeWriter.h @@ -2,18 +2,25 @@ #define TFLMCOMPILER_CODEWRITER_H #include - #include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/version.h" namespace tflmc { // Helper functions for top-level code generation. class CodeWriter { public: - CodeWriter(std::ostream &out, const tflite::SubGraph *subgraph); + CodeWriter(std::ostream &out, const tflite::SubGraph *subgraph, + tflite::ErrorReporter &errReporter); void writeBuiltin(tflite::BuiltinOperator op, const void *data, const std::string &name); + + void writeCustom(uint8_t const *opdata, size_t node_i, size_t opdata_size); + + std::pair getBuiltinStrings(tflite::BuiltinOperator op, + const void* data); // Write IntArray with variable declaration. 
void writeIntArray(const TfLiteIntArray &arr, const std::string &name); @@ -24,10 +31,41 @@ class CodeWriter { void writeQuantization(const TfLiteQuantization &q, const std::string &name); -#if TF_LITE_PACKED_QUANTIZED_DATA_VERSION == 100 - void writeQuantizationDetails(const TfLiteQuantization &q, - const std::string &name); -#endif + void writeTensorArena(size_t tensor_arena_size); + +template +void writeArray(const Container &container, size_t elt_size, bool is_const, + const char *decl, const char *name ) { + + out_ << decl << ' ' << name << R"([] = { +)"; + size_t elts = 0; + for (auto &e : container) { + out_ << std::to_string(e) << ","; + ++elts; + if (elts % 10 == 0) { + out_ << "\n"; + } else { + out_ << " "; + } + } + // To suppress warnings add dummy element if no scratch bufs + if (container.empty()) { + out_ << "0 // dummy to avoid empty vector"; + } + out_ << R"( +}; +)"; + + size_t footprint = elt_size * container.size(); + if (is_const) { + const_data_usage_ += footprint; + } else { + init_data_usage_ += footprint; + } +} + + template CodeWriter &operator<<(T &&value) { @@ -35,9 +73,20 @@ class CodeWriter { return *this; } + inline size_t initDataUsage() const { return init_data_usage_; } + + inline size_t uninitDataUsage() const { return uninit_data_usage_; } + + inline size_t constDataUsage() const { return const_data_usage_; } + private: std::ostream &out_; const tflite::SubGraph *subgraph_ = nullptr; + tflite::ErrorReporter &err_reporter_; + + size_t init_data_usage_; + size_t uninit_data_usage_; + size_t const_data_usage_; }; } // namespace tflmc diff --git a/src/Compiler.cc b/src/Compiler.cc index 53fe3e0..82a957a 100644 --- a/src/Compiler.cc +++ b/src/Compiler.cc @@ -1,34 +1,217 @@ #include "Compiler.h" - #include #include +#include #include #include +#include #include "CodeWriter.h" #include "CustomOperators.h" #include "RecordAllocations.h" +#include "Options.h" #include "TypeToString.h" -#include "tensorflow/lite/version.h" +#include "tensorflow/lite/c/common.h" -#ifndef SUFFICIENT_ARENA_SIZE -#define SUFFICIENT_ARENA_SIZE (128*1024*1024) + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA +#include "tflite_u_preint/static_init_support.h" #endif -#if TF_LITE_PACKED_QUANTIZED_DATA_VERSION -#if TF_LITE_PACKED_QUANTIZED_DATA_VERSION != 100 -#error "ONLY TF_LITE_PACKED_QUANTIZED_DATA_VERSION Version 100 supported!" 
+#ifndef SUFFICIENT_ARENA_SIZE +#define SUFFICIENT_ARENA_SIZE (128*1024*1024) #endif + +#ifndef SUFFICIENT_ARENA_ALIGNMENT +#define SUFFICIENT_ARENA_ALIGNMENT (16) #endif -bool tflmc::CompileFile(const std::string &modelFileName, - const std::string &outFileName, +const static int ILLEGAL_IF_EVER_MULTIPLE_SUBGRAPH = 0xdeadbeef; + + + +namespace tflmc +{ + + /** + * @brief Generation of specialized TensorInfo_t POD struct + * + */ + struct GeneratedTensorInfo { + + struct Full_t{ + TfLiteType type; + void* data; + TfLiteIntArray* dims; + size_t bytes; + TfLiteQuantization quantization; + bool is_variable; + }; + + static std::string generated(bool has_type, bool has_quantization, bool has_is_variable) { + + std::stringstream wr; + + wr << R"( +struct TensorInfo_t { // subset of TfLiteTensor used for initialization from constant memory +)"; + if (has_type) { + wr << " TfLiteType type;\n"; + } + wr << R"( void* data; + TfLiteIntArray* dims; + size_t bytes; +)"; + if (has_quantization) { + wr << " TfLiteQuantization quantization;\n"; + } + if (has_is_variable) { + wr << " bool is_variable;\n"; + } + wr << "};\n"; + return wr.str(); + } + + struct TrailingBoolField { + bool a_bool; + }; + + static size_t size(bool has_type, bool has_quantization, bool has_is_variable) { + auto size = sizeof(Full_t); + if (!has_type) { size -= sizeof(TfLiteType); } + if (!has_quantization) { size -= sizeof(TfLiteQuantization); } + // Dangling bool... prboably more accurate than simply sizeof(bool) + // once alignment / packing constraints are accounted for. + if (!has_is_variable) { size -= sizeof(TrailingBoolField); } + return size; + } + }; + + /** + * @brief Generation of specialized NodeInfo_t POD struct + * + */ + + struct GeneratedNodeInfo { + + enum used_operators_e { DUMMY_OP_INDEX, LAST_OP }; + + struct Full_t { + struct TfLiteIntArray* inputs; + struct TfLiteIntArray* outputs; + void* builtin_data; + used_operators_e used_op_index; + int custom_initial_data_size; + }; + + + static std::string generated(bool has_custom_ops) { + + std::stringstream wr; + wr << R"( +struct NodeInfo_t { // subset of TfLiteNode used for initialization from constant memory + struct TfLiteIntArray* inputs; + struct TfLiteIntArray* outputs; + void* builtin_data; + used_operators_e used_op_index; + )"; + if (has_custom_ops) { + wr << " int custom_initial_data_size;\n"; + } + wr << "};\n"; + return wr.str(); + } + + static size_t size(bool has_custom_ops) { + auto size = sizeof(Full_t); + if (!has_custom_ops) size -= sizeof(int); + return size; + } + }; +} // namespace tflmc + +static std::vector flat_namespaced_ops({ + tflite::BuiltinOperator_ADD, + tflite::BuiltinOperator_ADD_N, + tflite::BuiltinOperator_ASSIGN_VARIABLE, + tflite::BuiltinOperator_AVERAGE_POOL_2D, + tflite::BuiltinOperator_BATCH_TO_SPACE_ND, + tflite::BuiltinOperator_CALL_ONCE, + tflite::BuiltinOperator_CAST, + tflite::BuiltinOperator_CONV_2D, + tflite::BuiltinOperator_CUMSUM, + tflite::BuiltinOperator_DEPTH_TO_SPACE, + tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::BuiltinOperator_DIV, + tflite::BuiltinOperator_ELU, + tflite::BuiltinOperator_EXP, + tflite::BuiltinOperator_EXPAND_DIMS, + tflite::BuiltinOperator_FILL, + tflite::BuiltinOperator_FLOOR_DIV, + tflite::BuiltinOperator_FLOOR_MOD, + tflite::BuiltinOperator_FULLY_CONNECTED, + tflite::BuiltinOperator_GATHER, + tflite::BuiltinOperator_GATHER_ND, + tflite::BuiltinOperator_HARD_SWISH, + tflite::BuiltinOperator_IF, + tflite::BuiltinOperator_L2_POOL_2D, + tflite::BuiltinOperator_LEAKY_RELU, + 
tflite::BuiltinOperator_LOG_SOFTMAX, + tflite::BuiltinOperator_LOGICAL_AND, + tflite::BuiltinOperator_LOGICAL_OR, + tflite::BuiltinOperator_LOGISTIC, + tflite::BuiltinOperator_MAX_POOL_2D, + tflite::BuiltinOperator_MIRROR_PAD, + tflite::BuiltinOperator_MUL, + tflite::BuiltinOperator_PRELU, + tflite::BuiltinOperator_QUANTIZE, + tflite::BuiltinOperator_READ_VARIABLE, + tflite::BuiltinOperator_RELU, + tflite::BuiltinOperator_RELU6, + tflite::BuiltinOperator_RESIZE_BILINEAR, + tflite::BuiltinOperator_SHAPE, + tflite::BuiltinOperator_SLICE, + tflite::BuiltinOperator_SOFTMAX, + tflite::BuiltinOperator_SPACE_TO_BATCH_ND, + tflite::BuiltinOperator_SPACE_TO_DEPTH, + tflite::BuiltinOperator_SQUEEZE, + tflite::BuiltinOperator_SUB, + tflite::BuiltinOperator_SVDF, + tflite::BuiltinOperator_TRANSPOSE, + tflite::BuiltinOperator_TRANSPOSE_CONV, + tflite::BuiltinOperator_VAR_HANDLE, + tflite::BuiltinOperator_ZEROS_LIKE + }) +; + + +static std::vector graph_dependent_ops({ + + tflite::BuiltinOperator_ASSIGN_VARIABLE, + tflite::BuiltinOperator_CALL_ONCE, + tflite::BuiltinOperator_IF, + tflite::BuiltinOperator_VAR_HANDLE, + tflite::BuiltinOperator_READ_VARIABLE, + }) +; + +int tflmc::Compiler::TrackingErrorReporter::Report(const char* format, va_list args) { + vfprintf(stderr, format, args); + error_reported_ = true; + return 0; +} + + + +bool tflmc::CompileFile(const std::string &modelPathName, + const std::string &outSrcPathName, + const std::string &outHdrPathName, const std::string &prefix) { // Load model flatbuffer. - std::ifstream model_file(modelFileName, std::ios::binary | std::ios::ate); + std::ifstream model_file(modelPathName, std::ios::binary | std::ios::ate); if (!model_file) { - std::cerr << "Could not open " << modelFileName << " for read\n"; + std::cerr << "Could not open " << modelPathName << " for read\n"; return false; } auto sz = model_file.tellg(); @@ -43,23 +226,26 @@ bool tflmc::CompileFile(const std::string &modelFileName, return false; } - std::ofstream outFile(outFileName); + + std::ofstream outFile(outSrcPathName); if (!outFile) { - std::cerr << "Failed to create output file\n"; + std::cerr << "Failed to create output source file: " << outSrcPathName << std::endl;; return false; } - std::ofstream outHeaderFile(outFileName + ".h"); + std::ofstream outHeaderFile(outHdrPathName); if (!outHeaderFile) { - std::cerr << "Failed to create output header file\n"; + std::cerr << "Failed to create output header file: " << outHdrPathName << std::endl; return false; } try { Compiler compiler(model_data.data(), prefix); + compiler.writeSource(outFile); compiler.writeHeader(outHeaderFile); - return true; + compiler.reportMemUsage(); + return compiler.noErrorsReported(); } catch (const std::exception &e) { std::cerr << e.what() << "\n"; } catch (...) 
{ @@ -70,7 +256,10 @@ bool tflmc::CompileFile(const std::string &modelFileName, } tflmc::Compiler::Compiler(const void *modelData, const std::string &prefix) - : prefix_(prefix) { + : prefix_(prefix) + , arena_(SUFFICIENT_ARENA_SIZE, SUFFICIENT_ARENA_ALIGNMENT) { + aligned_arena_start_ = arena_.alginedBufferStart(); + arena_size_ = SUFFICIENT_ARENA_SIZE; if (!init(modelData)) { throw std::runtime_error("Could not set up compiler"); } @@ -104,14 +293,24 @@ bool tflmc::Compiler::init(const void *modelData) { for (auto outIndex : *subgraph_->outputs()) { outputTensorIndices_.push_back(outIndex); } - tflmc::custom_operator_handle custom = tflmc::LoadCustom(&resolver_); + tflmc::custom_operator_handle custom = + tflmc::LoadCustom(static_cast(&resolver_)); // Build an interpreter to run the model with. - arena_buf_.resize(SUFFICIENT_ARENA_SIZE); + interpreter_ = std::unique_ptr( new tflite::MicroInterpreter( - model_, resolver_, arena_buf_.data(), arena_buf_.size(), - µErrReporter_)); + model_, resolver_, aligned_arena_start_, arena_size_, + &errReporter())); + + // Now know model size etc so we can initialize (tables) + // in tensor arena memory map. + arenaMap_.init(interpreter_->operators_size()); + +#if TFLMC_USE_INTERPRETER_HOOKS + // Activate hooks to record memory alliocations to fill _arenaMaop etc. + tflmc::SetRecordAllocationhooks( interpreter_.get(), aligned_arena_start_, arena_size_); +#endif // Allocate memory from the tensor_arena for the model's tensors. TfLiteStatus allocate_status = interpreter_->AllocateTensors(); @@ -121,48 +320,46 @@ bool tflmc::Compiler::init(const void *modelData) { } ptrdiff_t ramTensorBufferSize = 0; - ptrdiff_t romOffset = 0; auto numTensors = tensors->size(); if (numTensors > 0) { auto tensor = GetTensor(interpreter_.get(), 0); common_tensor_type = tensor->type; - common_tensor_is_variable = tensor->is_variable; } for (size_t i = 0; i < numTensors; i++) { auto tensor = GetTensor(interpreter_.get(), i); tensors_.push_back({tensor}); - if (tensor->allocation_type == kTfLiteMmapRo) { - memMap_.recordROM(romOffset, tensor->bytes, getTensorName(i)); - romOffset += tensor->bytes; - } else { - ptrdiff_t offset = (uint8_t *)tensor->data.data - arena_buf_.data(); + if (tensor->allocation_type != kTfLiteMmapRo) { + ptrdiff_t offset = (uint8_t *)tensor->data.data - aligned_arena_start_; ptrdiff_t highSize = offset + tensor->bytes; ramTensorBufferSize = std::max(ramTensorBufferSize, highSize); - memMap_.recordRAM(offset, tensor->bytes, getTensorName(i)); + arenaMap_.recordPersistent(offset, tensor->bytes, getTensorName(i)); } + // determine whether we need to individually set these properties for each // tensor - if ((!has_quantization) && - tensor->quantization.type != kTfLiteNoQuantization) { - has_quantization = true; - } + has_quantization |= ( tensor->quantization.type != kTfLiteNoQuantization); if ((!common_tensor_type.None) && common_tensor_type.Some != tensor->type) { common_tensor_type.clear(); } - if ((!common_tensor_is_variable.None) && - common_tensor_is_variable.Some != tensor->is_variable) { - common_tensor_is_variable.clear(); - } + has_is_variable |= tensor->is_variable; } + int unsupported_ops = 0; for (size_t i = 0; i < interpreter_->operators_size(); i++) { - auto nodeAndReg = interpreter_->node_and_registration(i); + auto nodeAndReg = interpreter_->node_and_registration(ILLEGAL_IF_EVER_MULTIPLE_SUBGRAPH,i); auto node = &nodeAndReg.node; auto reg = nodeAndReg.registration; auto code = tflite::EnumValuesBuiltinOperator()[reg->builtin_code]; - 
printf("operation %lu: %s\n", i, tflite::EnumNamesBuiltinOperator()[code]); - + std::cout << "operation " << i + << ": " << tflite::EnumNamesBuiltinOperator()[code]; + + if (std::find(graph_dependent_ops.begin(), graph_dependent_ops.end(), code) != graph_dependent_ops.end()) { + std::cout << " - requires operator graph access(unsupported)" << std::endl; + ++unsupported_ops; + } else { + std::cout << std::endl; + } RegistrationInfo regInfo; regInfo.reg = reg; regInfo.code = code; @@ -180,85 +377,174 @@ bool tflmc::Compiler::init(const void *modelData) { nodes_.push_back(NodeInfo{*node, itOp - registrations_.begin()}); } - auto runtimeAllocations = tflmc::RecordAllocations(model_, SUFFICIENT_ARENA_SIZE); - ptrdiff_t minRuntimeOffset = 0; // These are negative so zero start is fine. - for (const auto &alloc : runtimeAllocations) { - minRuntimeOffset = std::min(minRuntimeOffset, alloc.offset); + if (unsupported_ops > 0 ) { + errReporter().Report("Model includes %d unsupported operators", unsupported_ops); + return false; + } + + for (size_t i = 0; i < registrations_.size(); i++) { + std::string opName; + auto code = registrations_[i].code; + if (code == tflite::BuiltinOperator_CUSTOM) { + opName = registrations_[i].custom_name; + } else { + opName = tflite::EnumNameBuiltinOperator(code); + } + } - size_t totalRuntimeAllocSize = 0; + +#if TFLMC_USE_INTERPRETER_HOOKS + tflmc::RecordScratchBufferAllocations(interpreter_.get()); +#else + tflmc::RecordAllocations(model_, SUFFICIENT_ARENA_SIZE, SUFFICIENT_ARENA_ALIGNMENT); +#endif + auto runtimeAllocations = tflmc::RecordedAllocations(); + for (const auto &alloc : runtimeAllocations) { - // TODO: This drops the alignment between buffers. Is this fine? - totalRuntimeAllocSize += alloc.len; - ptrdiff_t offset = alloc.offset - minRuntimeOffset + ramTensorBufferSize; - memMap_.recordRAM(offset, alloc.len, - "PersistentBuf" + std::to_string(alloc.nodeIndex)); + switch( alloc.kind ) { + case tflmc::AllocKind::Persistent : + arenaMap_.recordPersistent(alloc.offset, alloc.len, + "PersistentBuf_" + std::to_string(alloc.nodeIndex)); + break; + case tflmc::AllocKind::Scratch : + arenaMap_.recordScratchBuf(alloc.buffer_index, alloc.offset, alloc.len, alloc.nodeIndex, + "ScratchBuf_" + std::to_string(alloc.nodeIndex) + "_" + std::to_string(alloc.buffer_index)); + break; + default: + assert(false && "Urecognized allocation kind"); + } + + } - // This includes: + + // At this point memMap only records the tensor arena. // - Tensors // - Scratch buffers // - Persistent buffers - // tensor metadata is not included, since we declare them outside the arena - arenaBufferSize_ = ramTensorBufferSize + totalRuntimeAllocSize; - - // TODO: This is overestimating by quite a bit because of ABI differences. - size_t tensorMetaSize = tensors_.size() * sizeof(TfLiteTensor); - size_t nodeMetaSize = nodes_.size() * sizeof(TfLiteNode); - memMap_.recordRAM(arenaBufferSize_, tensorMetaSize, "TensorMetadata"); - memMap_.recordRAM(arenaBufferSize_ + tensorMetaSize, nodeMetaSize, - "NodeMetadata"); - memMap_.recordRAM(arenaBufferSize_ + tensorMetaSize + nodeMetaSize, - sizeof(TfLiteContext), "TfLiteContext"); - - memMap_.report(); - tflmc::UnloadCustom(custom); + // Required arena size is end of ram memory usage after we have + // compacted it. 
Currently merely by stripping the largest + // gap (usual the gap between head/tail of arena) + arenaMap_.stripLargestGap(SUFFICIENT_ARENA_ALIGNMENT); + tflmc::UnloadCustom(custom); return true; } -void tflmc::Compiler::writeSource(std::ostream &out) { - CodeWriter wr(out, subgraph_); +void tflmc::Compiler::finalizeMemMap(const CodeWriter &wr) +{ + size_t tensorMetaSize = tensors_.size() * (sizeof(TfLiteTensor)+sizeof(TfLiteEvalTensor)); + uninitMemMap_.record(tensorMetaSize, "TfliteTensorTables"); - wr << R"( -#include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/micro/kernels/micro_ops.h" + auto TensorInfo_t_size = + tflmc::GeneratedTensorInfo::size(!common_tensor_type.None, has_quantization, has_is_variable); + size_t tensorInfoSize = tensors_.size() * TensorInfo_t_size; + constMemMap_.record(tensorInfoSize, "TensorInfo"); -#if defined __GNUC__ -#define ALIGN(X) __attribute__((aligned(X))) -#elif defined _MSC_VER -#define ALIGN(X) __declspec(align(X)) -#elif defined __TASKING__ -#define ALIGN(X) __align(X) + initMemMap_.record(sizeof(TfLiteContext), "TfLiteContext"); + constMemMap_.record(sizeof(TfLiteContext), "TfLiteContext"); + + auto NodeInfo_t_size = tflmc::GeneratedNodeInfo::size(has_custom_ops); + size_t nodeMetaSize = nodes_.size() * NodeInfo_t_size; + constMemMap_.record(nodeMetaSize, "NodeDataTable"); + + size_t registrationsSize = registrations_.size() * sizeof(TfLiteRegistration); + initMemMap_.record(registrationsSize, "OpRegistrations"); + + constMemMap_.record(wr.constDataUsage(), "TensorAndOpdata"); + initMemMap_.record(wr.initDataUsage(), "TensorAndOpdata"); + uninitMemMap_.record(wr.uninitDataUsage(), "TensorAndOpdata"); + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + constMemMap_.record(tflite::micro::constDataUsage(), "OpUserData"); + initMemMap_.record(tflite::micro::initDataUsage(), "OpUserData"); + uninitMemMap_.record(tflite::micro::uninitDataUsage(), "OpUserData"); #endif -)"; +} + + +void tflmc::Compiler::reportMemUsage() +{ + + size_t romUsage = constMemMap_.size() + initMemMap_.size(); + std::fstream memmap_json; + auto options = Options::instance(); + if (!options.memmap_json.empty()) { + memmap_json.open(options.memmap_json, std::fstream::out); + if (!memmap_json) { + std::cerr << "Could not open '" << options.memmap_json << "' for writing." << std::endl; + exit(1); + } + + memmap_json << "{" << std::endl; + } + if (memmap_json.is_open()) { + memmap_json << "\"rodata\": " << constMemMap_.size() << "," << std::endl; + memmap_json << "\"data\": " << initMemMap_.size() << "," << std::endl; + memmap_json << "\"bss\": " << uninitMemMap_.size() << "," << std::endl; + } + std::cout << "ROM summary: "<< romUsage << " bytes total" << std::endl; + if (memmap_json.is_open()) { + memmap_json << "\"rom\": " << romUsage << "," << std::endl; + } + + size_t ramUsage = uninitMemMap_.size() + initMemMap_.size(); + if (memmap_json.is_open()) { + memmap_json << "\"ram\": " << ramUsage << std::endl; + memmap_json << "}" << std::endl; + memmap_json.close(); + if (!memmap_json) { + std::cerr << "Could not write '" << options.memmap_json << "'." 
<< std::endl; + exit(1); + } + } + + constMemMap_.report("const data (.rodata)"); + initMemMap_.report("initalized data (.data)"); + uninitMemMap_.report("uninitalized data (.bss)"); + arenaMap_.report("Tensor Arena details"); +} + + +void tflmc::Compiler::writeCustomRegistrationsSource(CodeWriter &wr) { + // declare custom registrations if (has_custom_ops) { wr << R"(namespace tflite { -namespace ops { namespace micro { )"; for (size_t i = 0; i < registrations_.size(); i++) { if (registrations_[i].code == tflite::BuiltinOperator_CUSTOM) { - wr << "extern TfLiteRegistration Register_" + wr << "extern TfLiteRegistration *Register_" << registrations_[i].custom_name << "(void);\n"; } } wr << R"(} // namespace micro -} // namespace ops } // namespace tflite )"; } +} + + + +void tflmc::Compiler::writeTypesAndWorkingArraysSource(CodeWriter &wr) { + wr << R"(namespace { -constexpr int kTensorArenaSize = )" - << arenaBufferSize_ << R"(; -uint8_t tensor_arena[kTensorArenaSize] ALIGN(16); +)"; + + wr.writeTensorArena(arenaMap_.size()); + wr << R"( + template struct TfArray { int sz; T elem[SZ]; }; +)"; + + wr << R"( + enum used_operators_e { )"; for (size_t i = 0; i < registrations_.size(); i++) { @@ -269,43 +555,43 @@ enum used_operators_e { << ", "; } } + wr << R"( OP_LAST }; -struct TensorInfo_t { // subset of TfLiteTensor used for initialization from constant memory -)"; - if (common_tensor_type.None) { - wr << " TfLiteType type;\n"; - } - wr << R"( void* data; - TfLiteIntArray* dims; - size_t bytes; -)"; - if (has_quantization) { - wr << " TfLiteQuantization quantization;\n"; - } - if (common_tensor_is_variable.None) { - wr << " bool is_variable;\n"; - } - wr << R"(}; -struct NodeInfo_t { // subset of TfLiteNode used for initialization from constant memory - struct TfLiteIntArray* inputs; - struct TfLiteIntArray* outputs; - void* builtin_data; - used_operators_e used_op_index; + )"; - if (has_custom_ops) { - wr << " int custom_initial_data_size;\n"; - } - wr << R"(}; + + wr << + tflmc::GeneratedTensorInfo::generated(common_tensor_type.None, has_quantization, has_is_variable); + + wr << + tflmc::GeneratedNodeInfo::generated(has_custom_ops); + + wr << R"( TfLiteContext ctx{}; -TfLiteTensor tflTensors[)" - << tensors_.size() << R"(]; + +// Tensor table with space for -1-th element used +// designate missing optional inputs/outputs. 
+TfLiteTensor tflTensorsWithMinus1[)" + << tensors_.size()+1u << R"(]; + TfLiteEvalTensor evalTensors[)" << tensors_.size() << R"(]; + +TfLiteTensor * const tflTensors = tflTensorsWithMinus1+1; + TfLiteRegistration registrations[OP_LAST]; -TfLiteNode tflNodes[)" - << nodes_.size() << R"(]; +)"; + +} + + +void tflmc::Compiler::writeTflNodesSource(CodeWriter &wr) { + + wr << "constexpr size_t kOpNodesCount = " << nodes_.size() <<";\n\n"; + wr << R"( +TfLiteNode tflNodes[kOpNodesCount]; )"; for (size_t i = 0; i < tensors_.size(); i++) { @@ -315,19 +601,13 @@ TfLiteNode tflNodes[)" } wr.writeIntArray(*t->dims, "tensor_dimension" + std::to_string(i)); wr.writeQuantization(t->quantization, "quant" + std::to_string(i)); -#if TF_LITE_PACKED_QUANTIZED_DATA_VERSION - wr.writeQuantizationDetails(t->quantization, "quant_details" + std::to_string(i)); -#endif } for (size_t i = 0; i < nodes_.size(); i++) { auto &node = nodes_[i].node; auto ®Info = registrations_[nodes_[i].regIndex]; if (regInfo.code == tflite::BuiltinOperator_CUSTOM) { - wr << "uint8_t ALIGN(4) opdata" + std::to_string(i) << "[" - << node.custom_initial_data_size << "] = { "; - for (int j = 0; j < node.custom_initial_data_size; ++j) - wr << int(((uint8_t const *)node.custom_initial_data)[j]) << ", "; - wr << " }; /* custom_initial_data */\n"; + wr.writeCustom((uint8_t const *)node.custom_initial_data, i, + node.custom_initial_data_size); } else { wr.writeBuiltin(regInfo.code, node.builtin_data, "opdata" + std::to_string(i)); @@ -335,6 +615,11 @@ TfLiteNode tflNodes[)" wr.writeIntArray(*node.inputs, "inputs" + std::to_string(i)); wr.writeIntArray(*node.outputs, "outputs" + std::to_string(i)); } +} + + +void tflmc::Compiler::writeTensorDataSource(CodeWriter &wr) { + wr << R"(const TensorInfo_t tensorData[] = { )"; for (size_t i = 0; i < tensors_.size(); i++) { @@ -347,7 +632,7 @@ TfLiteNode tflNodes[)" wr << "(void*)tensor_data" << i; } else { wr << "tensor_arena + " - << ((uintptr_t)t->data.data - (uintptr_t)arena_buf_.data()); + << ((uintptr_t)t->data.data - (uintptr_t)aligned_arena_start_); } wr << ", " << "(TfLiteIntArray*)&tensor_dimension" << i << ", "; @@ -357,29 +642,29 @@ TfLiteNode tflNodes[)" wr << "{kTfLiteAffineQuantization, " "const_cast(static_cast(&quant" << i << ")) "; +#if SUPPORT_CUSTOM_QUANT + } else if (t->quantization.type == kTfLitePackedAffineQuantization) { + wr << "{kTfLitePackedAffineQuantization, " + "const_cast(static_cast(&quant" + << i << ")) "; +#endif // SUPPORT_CUSTOM_QUANT } else { wr << "{kTfLiteNoQuantization, nullptr "; } -#if TF_LITE_PACKED_QUANTIZED_DATA_VERSION - if (t->quantization.details.type == kTfLiteSub8BitPackedUniformDetail) { - wr << ", {kTfLiteSub8BitPackedUniformDetail, " - "{&quant_details" - << i << "}}"; - } else { - wr << ", {kTfLiteNoDetails, {}}"; - } -#endif + wr << "},"; } - if (common_tensor_is_variable.None) { - wr << std::to_string(t->is_variable) - << ", "; // TODO: is there a bool to string? 
+ if (has_is_variable) { + wr << t->is_variable << ", "; } wr << "},\n"; } wr << "};\n"; - wr << R"(const NodeInfo_t nodeData[] = { +} + +void tflmc::Compiler::writeNodeDataSource(CodeWriter &wr) { + wr << R"(const NodeInfo_t nodeData[kOpNodesCount] = { )"; for (size_t i = 0; i < nodes_.size(); i++) { wr << " { (TfLiteIntArray*)&inputs" << i << ", "; @@ -406,11 +691,45 @@ TfLiteNode tflNodes[)" } wr << "},\n"; } - wr << "};"; - // TODO: This code assumes that persistent allocations are made from the end - // (which is true for the current implementation) + wr << "};\n\n"; + +} + + +void tflmc::Compiler::writeScratchBufferOffsets(CodeWriter &wr) { + + // Complication: nodes with offline pre-computed user_data (OpData) + // won't actually call RequestScratchBufferInArena + // so we need to compute correct next_scratch_buffer_idx for each node + // from calls made during pre-interpretation + wr << R"( + // Used by RequestScratchBufferInArena to generate buffer index + // for each request. Reset for each node from _init to allow + // for nodes omitting calls as scratch buffer indexes is in pre-computed OpData + int next_scratch_buffer_idx; + )"; + + wr.writeArray(arenaMap_.nodesScratchBufferAllocationCounts(), sizeof(uint8_t), true, + "const uint8_t", "node_scratch_buffer_requests" + ); + wr << "\n"; + wr.writeArray(arenaMap_.scratchBufOffsets(), sizeof(size_t), true, + "const size_t", "scratchbuf_offsets"); +} + + + +void tflmc::Compiler::writeContextAllocationHandlersSource(CodeWriter &wr) { + + // We assume that persistent allocations are made from the end + // of the arena downwards. We should really have have a CI test + // to verify this explicitly but it is VERY unlikely the other + // tests will pass if tflite(u) changes this one day. + // Obviously adding support for external memory allocation + // would complicate this... + wr << R"( -static void* AllocatePersistentBuffer(struct TfLiteContext* ctx, +void *AllocatePersistentBuffer(struct TfLiteContext* ignore, size_t bytes) { static uint8_t *AllocPtr = tensor_arena + sizeof(tensor_arena); @@ -418,22 +737,152 @@ static void* AllocatePersistentBuffer(struct TfLiteContext* ctx, return AllocPtr; } -static TfLiteEvalTensor *GetEvalTensor(const struct TfLiteContext *context, +TfLiteEvalTensor *GetEvalTensor(const struct TfLiteContext *ignore, int tensor_idx) { return &evalTensors[tensor_idx]; } -} // namespace +)"; + +// Scratch buffers are "easy" - we simply re-use the allocations +// from our offline init/prepare phases. This must of course +// match the target build. Worse case same kernel library, +// target compiler settings and target compiler. +// Complication: nodes with offline pre-computed user_data (OpData) +// won't actually call RequestScratchBufferInArena +// so we record the calls each node made and corect next_scratch_buffer_idx +// from that after each prepare call. 
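Concretely, the replay relies on two tables recorded during pre-interpretation: how many scratch buffers each node requested, and the arena offset assigned to every scratch buffer index. A sketch of the replay loop (the table contents here are made up for illustration):

#include <cstddef>
#include <cstdio>

// Recorded offline: per-node request counts and per-index arena offsets.
static const unsigned char requests_per_node[] = {0, 2, 1};
static const size_t scratch_offsets[] = {1024, 2048, 4096};

int main() {
  int next_scratch_buffer_idx = 0;
  size_t precomputed_idx = 0;
  for (size_t node = 0; node < sizeof(requests_per_node); ++node) {
    // Reset before each prepare() so a node that *does* still call
    // RequestScratchBufferInArena is handed the pre-planned indices.
    next_scratch_buffer_idx = static_cast<int>(precomputed_idx);
    // ... registrations[...].prepare(&ctx, &tflNodes[node]) would run here ...
    precomputed_idx += requests_per_node[node];
  }
  std::printf("last reset index %d, total scratch buffers %zu, highest offset %zu\n",
              next_scratch_buffer_idx, precomputed_idx,
              scratch_offsets[precomputed_idx - 1]);
  return 0;
}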
-TfLiteStatus )" + wr << R"( +TfLiteStatus RequestScratchBufferInArena(TfLiteContext *ignored, + size_t bytes_ignored, + int *buffer_idx) { + *buffer_idx = next_scratch_buffer_idx; + ++next_scratch_buffer_idx; + return kTfLiteOk; +} + +void* GetScratchBuffer(struct TfLiteContext *ignore, int buffer_idx) { + return tensor_arena + scratchbuf_offsets[buffer_idx]; +} +)"; + +} + + +void tflmc::Compiler::writeMicroContextSource(CodeWriter &wr) { + + wr << R"( +class )" << prefix_ << R"(PreinterpretedMicroContext : public tflite::MicroContext { + public: + )" << prefix_ << R"(PreinterpretedMicroContext() : + tflite::MicroContext(nullptr, nullptr, nullptr) {} + + // Allocate persistent buffer which has the same life time as the interpreter. + // Returns nullptr on failure. + // The memory is allocated from the tail. + // This method is only available in Init or Prepare stage. + // Virtual so that it can be faked for kernel tests. + virtual void* AllocatePersistentBuffer(size_t bytes) { + return ::AllocatePersistentBuffer(nullptr, bytes); + } + + // Request a scratch buffer in the arena through static memory planning. + // This method is only available in Prepare stage and the buffer is allocated + // by the interpreter between Prepare and Eval stage. In Eval stage, + // GetScratchBuffer API can be used to fetch the address. + // Virtual so that it can be faked for kernel tests. + virtual TfLiteStatus RequestScratchBufferInArena(size_t bytes, + int* buffer_idx) { + return ::RequestScratchBufferInArena(nullptr, bytes, buffer_idx); + } + + // Get the scratch buffer pointer. + // This method is only available in Eval stage. + // Virtual so that it can be faked for kernel tests. + virtual void* GetScratchBuffer(int buffer_idx) { + return ::GetScratchBuffer(nullptr, buffer_idx); + } + + // Returns a temporary TfLiteTensor struct for a given index. + // Virtual so that it can be faked for kernel tests. + virtual TfLiteTensor* AllocateTempTfLiteTensor(int tensor_idx) { + return tensor_idx >= 0 ? &tflTensors[tensor_idx] : nullptr; + } + + // Returns a temporary TfLiteTensor struct for the specified input tensor of a + // given mode. This is the recommended API over the deprecated + // GetInput/GetInputSafe to get a temp input tensor. The returned tensor shall + // be freed via calling DeallocateTempTfLiteTensor. + virtual TfLiteTensor* AllocateTempInputTensor(const TfLiteNode* node, + int index) { + return AllocateTempTfLiteTensor(node->inputs->data[index]); + } + + // Returns a temporary TfLiteTensor struct for the specified output tensor of + // a given mode. This is the recommended API over the deprecated + // GetOutput/GetOutputSafe to get a temp output tensor. The returned tensor + // shall be freed via calling DeallocateTempTfLiteTensor. + virtual TfLiteTensor* AllocateTempOutputTensor(const TfLiteNode* node, + int index) { + return AllocateTempTfLiteTensor(node->outputs->data[index]); + } + + // Deallocates a temp TfLiteTensor. + // Virtual so that it can be faked for kernel tests. + virtual void DeallocateTempTfLiteTensor(TfLiteTensor* tensor) { + // No-op + } + + // Returns a TfLiteEvalTensor struct for a given index. + // Virtual so that it can be faked for kernel tests. + virtual TfLiteEvalTensor* GetEvalTensor(int tensor_idx) { + return ::GetEvalTensor(nullptr, tensor_idx); + } + + + // Does not take ownership of the pointer and the pointer must refer to valid + // an object that outlive this class instance. + // This can only be called once to set one external context. 
+ TfLiteStatus set_external_context(void* external_context_payload); + + void* external_context() { return external_context_payload_; } +protected: + void* external_context_payload_ = nullptr; + + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +)"; + +} + + +void tflmc::Compiler::writeInitSource(CodeWriter &wr) { + + wr << R"(extern "C" TfLiteStatus )" << prefix_ << R"(init() { ctx.AllocatePersistentBuffer = &AllocatePersistentBuffer; + ctx.RequestScratchBufferInArena = &RequestScratchBufferInArena; + ctx.GetScratchBuffer = &GetScratchBuffer; ctx.GetEvalTensor = &GetEvalTensor; ctx.tensors = tflTensors; )"; wr << " ctx.tensors_size = " << tensors_.size() << ";\n"; + + wr << R"( + static )" << prefix_ << R"(PreinterpretedMicroContext u_ctx; + ctx.impl_ = static_cast(&u_ctx); +)"; + // TODO: Do we really support variable tensors? // TODO: Do we encounter other than kTfLiteMmapRo and kTfLiteArenaRw, if so we // need to store the type separately. + + wr << R"( + TfLiteIntArray dimsEmptyTensor = {0}; + tflTensors[-1].dims = &dimsEmptyTensor; + tflTensors[-1].data.raw = nullptr; +)"; wr << " for(size_t i = 0; i < " << tensors_.size() << R"(; ++i) { tflTensors[i].data.data = tensorData[i].data; evalTensors[i].data.data = tensorData[i].data; @@ -447,11 +896,10 @@ TfLiteStatus )" wr << " evalTensors[i].type = " << tflmc::to_string(common_tensor_type.Some) << ";\n"; } - if (common_tensor_is_variable.None) { + if (has_is_variable) { wr << " tflTensors[i].is_variable = tensorData[i].is_variable;\n"; } else { - wr << " tflTensors[i].is_variable = " - << std::to_string(common_tensor_is_variable.Some) << ";\n"; + wr << " tflTensors[i].is_variable = false;\n"; } wr << R"( tflTensors[i].allocation_type = (tensor_arena <= tensorData[i].data && tensorData[i].data < tensor_arena + kTensorArenaSize) ? 
kTfLiteArenaRw : kTfLiteMmapRo; tflTensors[i].bytes = tensorData[i].bytes; @@ -464,6 +912,10 @@ TfLiteStatus )" TfLiteAffineQuantization const* quant = ((TfLiteAffineQuantization const*)(tensorData[i].quantization.params)); tflTensors[i].params.scale = quant->scale->data[0]; tflTensors[i].params.zero_point = quant->zero_point->data[0]; + } else if (tflTensors[i].quantization.type == kTfLitePackedAffineQuantization) { + TfLitePackedAffineQuantization const* quant = (TfLitePackedAffineQuantization const*)(tensorData[i].quantization.params); + tflTensors[i].params.scale = quant->affine.scale->data[0]; + tflTensors[i].params.zero_point = quant->affine.zero_point->data[0]; } )"; } else { @@ -471,18 +923,25 @@ TfLiteStatus )" } wr << R"( } )"; + for (size_t i = 0; i < registrations_.size(); i++) { std::string opName; - if (registrations_[i].code == tflite::BuiltinOperator_CUSTOM) { + auto code = registrations_[i].code; + if (code == tflite::BuiltinOperator_CUSTOM) { opName = registrations_[i].custom_name; } else { - opName = tflite::EnumNameBuiltinOperator(registrations_[i].code); + opName = tflite::EnumNameBuiltinOperator(code); } - wr << " registrations[OP_" << opName << "] = tflite::ops::micro::Register_" - << opName << "();\n"; } wr << "\n"; - wr << " for(size_t i = 0; i < " << nodes_.size() << R"(; ++i) { +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + wr << R"( +#if TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +tflite::micro::resetOfflineOpUserData( tflite::micro::)" << prefix_ << R"(model::precomputed_op_user_data); +#endif // TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +)"; +#endif + wr << R"( for(size_t i = 0; i < kOpNodesCount; ++i) { tflNodes[i].inputs = nodeData[i].inputs; tflNodes[i].outputs = nodeData[i].outputs; tflNodes[i].builtin_data = nodeData[i].builtin_data; @@ -499,45 +958,129 @@ TfLiteStatus )" } } )"; - wr << " for(size_t i = 0; i < " << nodes_.size() << R"(; ++i) { + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + wr << R"( +#if TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +tflite::micro::resetOfflineOpUserData( tflite::micro::)" << prefix_ << R"(model::precomputed_op_user_data); +#endif // TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +)"; +#endif + + wr << R"( size_t precomputed_sb_idx_ctr = 0; + + for(size_t i = 0; i < kOpNodesCount; ++i) { + next_scratch_buffer_idx = precomputed_sb_idx_ctr; if (registrations[nodeData[i].used_op_index].prepare) { TfLiteStatus status = registrations[nodeData[i].used_op_index].prepare(&ctx, &tflNodes[i]); if (status != kTfLiteOk) { return status; } } + precomputed_sb_idx_ctr += node_scratch_buffer_requests[i]; } return kTfLiteOk; } +)"; -static const int inTensorIndices[] = { - )"; - for (auto inIndex : inputTensorIndices_) { - out << inIndex << ", "; - } - out << R"( -}; -TfLiteTensor* )" - << prefix_ << R"(input(int index) { - return &ctx.tensors[inTensorIndices[index]]; } -static const int outTensorIndices[] = { - )"; // TODO: perhaps use a smaller type than int? 
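The allocation_type assignment in the generated init() above boils down to a half-open address-range test: anything whose data pointer lies inside the tensor arena is kTfLiteArenaRw, everything else (flatbuffer-resident constants) is kTfLiteMmapRo. The check in isolation, using integer comparisons to keep it well defined:

#include <cstdint>
#include <cstdio>

static uint8_t tensor_arena[256];
static const uint8_t const_weights[16] = {};

static bool InArena(const void *p) {
  const uintptr_t a = reinterpret_cast<uintptr_t>(tensor_arena);
  const uintptr_t v = reinterpret_cast<uintptr_t>(p);
  return v >= a && v < a + sizeof(tensor_arena);  // [arena, arena + size)
}

int main() {
  std::printf("arena+8 -> %s\n", InArena(tensor_arena + 8) ? "kTfLiteArenaRw" : "kTfLiteMmapRo");
  std::printf("weights -> %s\n", InArena(const_weights) ? "kTfLiteArenaRw" : "kTfLiteMmapRo");
  return 0;
}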
- for (auto outIndex : outputTensorIndices_) { - out << outIndex << ", "; + +void tflmc::Compiler::writeTensorAccessorsSource(CodeWriter &wr) { + wr << R"( +extern "C" TfLiteTensor* )" + << prefix_ << R"(input(int index) { + static const int inTensorIndices[] = { + )"; + for (auto inIndex : inputTensorIndices_) { + wr << inIndex << ", "; + } + wr << R"( + }; + return &ctx.tensors[inTensorIndices[index]]; } - out << R"( -}; -TfLiteTensor* )" - << prefix_ << R"(output(int index) { - return &ctx.tensors[outTensorIndices[index]]; + +extern "C" TfLiteTensor* )" + << prefix_ << R"(output(int index) { + static const int outTensorIndices[] = { + )"; // TODO: perhaps use a smaller type than int? + for (auto outIndex : outputTensorIndices_) { + wr << outIndex << ", "; + } + wr << R"( + }; + return &ctx.tensors[outTensorIndices[index]]; + } + )"; + + + std::string code = R"( + +// Returns the number of input tensors. +extern "C" size_t %PREFIX%inputs() { + return )" + std::to_string(inputTensorIndices_.size()) + + R"(; +} +// Returns the number of output tensors. +extern "C" size_t %PREFIX%outputs() { + return )" + std::to_string(outputTensorIndices_.size()) + + R"(; +} + +extern "C" void *%PREFIX%input_ptr(int index) { + return %PREFIX%input(index)->data.data; +} +extern "C" size_t %PREFIX%input_size(int index) { + return %PREFIX%input(index)->bytes; +} +extern "C" int %PREFIX%input_dims_len(int index) { + return %PREFIX%input(index)->dims->size; +} +extern "C" int *%PREFIX%input_dims(int index) { + return &%PREFIX%input(index)->dims->data[0]; +} + +extern "C" void *%PREFIX%output_ptr(int index) { + return %PREFIX%output(index)->data.data; +} +extern "C" size_t %PREFIX%output_size(int index) { + return %PREFIX%output(index)->bytes; +} +extern "C" int %PREFIX%output_dims_len(int index) { + return %PREFIX%output(index)->dims->size; +} +extern "C" int *%PREFIX%output_dims(int index) { + return &%PREFIX%output(index)->dims->data[0]; +} + +)"; + + static std::regex rePrefix("%PREFIX%"); + code = std::regex_replace(code, rePrefix, prefix_); + + wr << code; + } -TfLiteStatus )" + +void tflmc::Compiler::writeInvokeSource(CodeWriter &wr) { + wr << R"( + +extern "C" TfLiteStatus )" << prefix_ << R"(invoke() { - for(size_t i = 0; i < )" - << nodes_.size() << R"(; ++i) { +)"; +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + wr << R"( +#if TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +tflite::micro::resetOfflineOpUserData( tflite::micro::)" << prefix_ << R"(model::precomputed_op_user_data); +#endif // TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +)"; +#endif + wr << R"( + for(size_t i = 0; i < kOpNodesCount; ++i) { +#if LOG_OP_INPUTS + tflite::logOpInvoke(&ctx, &tflNodes[i]); +#endif TfLiteStatus status = registrations[nodeData[i].used_op_index].invoke(&ctx, &tflNodes[i]); if (status != kTfLiteOk) { return status; @@ -545,11 +1088,92 @@ TfLiteStatus )" } return kTfLiteOk; } + +)"; +} + + +void tflmc::Compiler::writeSource(std::ostream &out) { + CodeWriter wr(out, subgraph_, microErrReporter_); + + wr << R"( +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/micro_ops.h" +#include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/micro_context.h" +#if LOG_OP_INPUTS +#include "tensorflow/lite/micro/micro_invoke_log.h" +#endif +)"; + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + wr << R"( +#if TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +#include "tensorflow/lite/micro/kernels/ifx_common/offline_prepare_utils.h" +#endif // 
TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +)"; +#endif + + wr << R"( + +#if defined __GNUC__ +#define ALIGN(X) __attribute__((aligned(X))) +#elif defined _MSC_VER +#define ALIGN(X) __declspec(align(X)) +#elif defined __TASKING__ +#define ALIGN(X) __align(X) +#endif + +)"; + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + wr << "#if TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA\n"; + tflite::micro::writeStaticOpDataHeaders(out); + wr << "#endif // TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA\n"; +#endif + + writeCustomRegistrationsSource(wr); + + writeTypesAndWorkingArraysSource(wr); + + writeTflNodesSource(wr); + + writeTensorDataSource(wr); + + writeNodeDataSource(wr); + + writeScratchBufferOffsets(wr); + + writeContextAllocationHandlersSource(wr); + +// TODO: Really need to support AllocateBufferForEval. Should be easy - just need to +// permit allocating a suitable "gap" in the arena or a dedicated scratchpad area. + +wr << R"( +} // namespace )"; + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + wr << "#if TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA\n"; + tflite::micro::writeStaticOpDataDefinitions(prefix_, out); + wr << "#endif // TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA\n"; +#endif + + writeMicroContextSource(wr); + + writeInitSource(wr); + + writeTensorAccessorsSource(wr); + + writeInvokeSource(wr); + + finalizeMemMap(wr); } + void tflmc::Compiler::writeHeader(std::ostream &out) { - tflmc::CodeWriter wr(out, subgraph_); + tflmc::CodeWriter wr(out, subgraph_, errReporter()); std::string code = R"( #ifndef %PREFIX%GEN_H @@ -557,6 +1181,18 @@ void tflmc::Compiler::writeHeader(std::ostream &out) { #include "tensorflow/lite/c/common.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define %PREFIX%MODEL_CONST_DATA_SIZE )"+ std::to_string(constMemMap_.size()) + + R"( +#define %PREFIX%MODEL_INIT_DATA_SIZE )"+ std::to_string(initMemMap_.size()) + + R"( +#define %PREFIX%MODEL_UNINIT_DATA_SIZE )"+ std::to_string(uninitMemMap_.size()) + + R"( + + // Sets up the model with init and prepare steps. TfLiteStatus %PREFIX%init(); // Returns the input tensor with the given index. @@ -567,41 +1203,38 @@ TfLiteTensor *%PREFIX%output(int index); TfLiteStatus %PREFIX%invoke(); // Returns the number of input tensors. -inline size_t %PREFIX%inputs() { - return )" + std::to_string(inputTensorIndices_.size()) + - R"(; -} +size_t %PREFIX%inputs(); + // Returns the number of output tensors. 
-inline size_t %PREFIX%outputs() { - return )" + std::to_string(outputTensorIndices_.size()) + - R"(; -} +size_t %PREFIX%outputs(); -inline void *%PREFIX%input_ptr(int index) { - return %PREFIX%input(index)->data.data; -} -inline size_t %PREFIX%input_size(int index) { - return %PREFIX%input(index)->bytes; -} -inline int %PREFIX%input_dims_len(int index) { - return %PREFIX%input(index)->dims->data[0]; -} -inline int *%PREFIX%input_dims(int index) { - return &%PREFIX%input(index)->dims->data[1]; -} +// Return the buffer pointer of input tensor +void *%PREFIX%input_ptr(int index); -inline void *%PREFIX%output_ptr(int index) { - return %PREFIX%output(index)->data.data; -} -inline size_t %PREFIX%output_size(int index) { - return %PREFIX%output(index)->bytes; -} -inline int %PREFIX%output_dims_len(int index) { - return %PREFIX%output(index)->dims->data[0]; -} -inline int *%PREFIX%output_dims(int index) { - return &%PREFIX%output(index)->dims->data[1]; -} +// Return the buffer size of input tensor +size_t %PREFIX%input_size(int index); + +// Return the dimention size of input tensor +int %PREFIX%input_dims_len(int index); + +// Return the dimention buffer pointer of input tensor +int *%PREFIX%input_dims(int index); + +// Return the buffer pointer of output tensor +void *%PREFIX%output_ptr(int index); + +// Return the buffer size of output tensor +size_t %PREFIX%output_size(int index); + +// Return the dimention size of output tensor +int %PREFIX%output_dims_len(int index); + +// Return the dimention buffer pointer of output tensor +int *%PREFIX%output_dims(int index); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus #endif )"; @@ -620,7 +1253,7 @@ std::string tflmc::Compiler::getTensorName(int tensorIndex) const { auto nOps = interpreter_->operators_size(); for (size_t i = 0; i < nOps; i++) { - auto nodeAndReg = interpreter_->node_and_registration(i); + auto nodeAndReg = interpreter_->node_and_registration(ILLEGAL_IF_EVER_MULTIPLE_SUBGRAPH,i); auto node = &nodeAndReg.node; auto checkAndAdd = [&](const TfLiteIntArray *indices, @@ -642,3 +1275,7 @@ std::string tflmc::Compiler::getTensorName(int tensorIndex) const { return ss.str(); } + +bool tflmc::Compiler::noErrorsReported() const { + return ! microErrReporter_.getErrorReported(); +} diff --git a/src/Compiler.h b/src/Compiler.h index 3a69b27..b2ff90f 100644 --- a/src/Compiler.h +++ b/src/Compiler.h @@ -4,16 +4,20 @@ #include #include "MemMap.h" +#include "ModelInfo.h" #include "tensorflow/lite/micro/all_ops_resolver.h" -#include "tensorflow/lite/micro/micro_error_reporter.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/micro/micro_interpreter.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflmc { -bool CompileFile(const std::string &modelFileName, - const std::string &outFileName, - const std::string &prefix = "model_"); +class CodeWriter; + +bool CompileFile(const std::string &modelPathName, + const std::string &outSrcPathName, + const std::string &outHdrPathName, + const std::string &prefix); class Compiler { public: @@ -23,67 +27,74 @@ class Compiler { void writeSource(std::ostream &out); void writeHeader(std::ostream &out); - + void reportMemUsage(); + // Returns a name that describes a tensors relation to network layers. 
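Both the accessor emitter and writeHeader() keep a single template string and stamp the user-chosen naming prefix into every %PREFIX% placeholder with one std::regex_replace call. The substitution in isolation, assuming a prefix of "net1_":

#include <iostream>
#include <regex>
#include <string>

int main() {
  std::string code = R"(TfLiteStatus %PREFIX%init();
TfLiteTensor *%PREFIX%input(int index);
TfLiteStatus %PREFIX%invoke();)";
  static const std::regex rePrefix("%PREFIX%");
  std::cout << std::regex_replace(code, rePrefix, "net1_") << std::endl;
  return 0;
}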
std::string getTensorName(int tensorIndex) const; + bool noErrorsReported() const; + private: bool init(const void *modelData); tflite::ErrorReporter &errReporter() { return microErrReporter_; } + void writeCustomRegistrationsSource(CodeWriter &wr); + + void writeTflNodesSource(CodeWriter &wr); + + void writeTensorDataSource(CodeWriter &wr); + + void writeTypesAndWorkingArraysSource(CodeWriter &wr); + + void writeNodeDataSource(CodeWriter &wr); + + void writeScratchBufferOffsets(CodeWriter &wr); + + void writeContextAllocationHandlersSource(CodeWriter &wr); + + void writeMicroContextSource(CodeWriter &wr); + + void writeInitSource(CodeWriter &wr); + + void writeTensorAccessorsSource(CodeWriter &wr); + + void writeInvokeSource(CodeWriter &wr); + + void finalizeMemMap(const CodeWriter &wr); + private: - struct TensorInfo { - TensorInfo(const TfLiteTensor *tensor_ptr) : - tensor(tensor_ptr) - {} - const TfLiteTensor *tensor = nullptr; - }; - struct RegistrationInfo { - const TfLiteRegistration *reg = nullptr; - tflite::BuiltinOperator code; - std::string custom_name; - bool operator==(const RegistrationInfo &other) { - if (code != other.code) return false; - if (code == tflite::BuiltinOperator_CUSTOM) { - return custom_name == other.custom_name; - } else - return true; - } - }; - struct NodeInfo { - NodeInfo() {} - NodeInfo(TfLiteNode tfl_node, ptrdiff_t reg_index) : - node(tfl_node), - regIndex(reg_index) - {} - TfLiteNode node; - ptrdiff_t regIndex = -1; - }; - template - struct Option { - bool None = true; - T Some = T(); - void operator=(T const &val) { - None = false; - Some = val; - } - void clear() { - Some = T(); - None = true; - } + + /** + * @brief Error reporter that tracks if Error was reported. + * + */ + class TrackingErrorReporter : public tflite::ErrorReporter { + public: + + ~TrackingErrorReporter() {} + int Report(const char* format, va_list args) override; + + bool getErrorReported() const { return error_reported_; } + + private: + + bool error_reported_ = false; }; - private: std::string prefix_; - tflite::MicroErrorReporter microErrReporter_; + TrackingErrorReporter microErrReporter_; const tflite::Model *model_ = nullptr; const tflite::SubGraph *subgraph_ = nullptr; tflite::AllOpsResolver resolver_; - std::vector arena_buf_; + SufficientArena arena_; + uint8_t *aligned_arena_start_; + size_t arena_size_; std::unique_ptr interpreter_; - MemMap memMap_; + ArenaMemMap arenaMap_; + MemMap initMemMap_; + MemMap uninitMemMap_; + MemMap constMemMap_; - size_t arenaBufferSize_ = 0; std::vector tensors_; std::vector registrations_; std::vector nodes_; @@ -93,7 +104,7 @@ class Compiler { bool has_custom_ops = false; bool has_quantization = false; Option common_tensor_type; - Option common_tensor_is_variable; + bool has_is_variable = false; }; } // namespace tflmc diff --git a/src/CustomOperators.cc b/src/CustomOperators.cc index d5ae115..3d5859b 100644 --- a/src/CustomOperators.cc +++ b/src/CustomOperators.cc @@ -15,20 +15,19 @@ limitations under the License. 
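The Report() override of TrackingErrorReporter is not part of this hunk; the intent is simply to latch a flag whenever any error is reported so the driver can fail the run. A plausible shape, sketched against a local stand-in base class so it compiles on its own (the real class derives from tflite::ErrorReporter):

#include <cstdarg>
#include <cstdio>

class Reporter {                       // stand-in for tflite::ErrorReporter
 public:
  virtual ~Reporter() {}
  virtual int Report(const char *format, va_list args) = 0;
};

class TrackingReporter : public Reporter {
 public:
  int Report(const char *format, va_list args) override {
    error_reported_ = true;            // remember that something went wrong
    return std::vfprintf(stderr, format, args);
  }
  bool getErrorReported() const { return error_reported_; }

 private:
  bool error_reported_ = false;
};

static int Emit(Reporter &r, const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  const int n = r.Report(fmt, args);
  va_end(args);
  return n;
}

int main() {
  TrackingReporter rep;
  Emit(rep, "arena too small by %d bytes\n", 128);
  return rep.getErrorReported() ? 1 : 0;
}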
#include "CustomOperators.h" -#include - -#include // dynamic loading for custom operators -#ifndef _WIN32 +#ifdef LINUX +#include +#include #include tflmc::custom_operator_handle tflmc::LoadCustom( - tflite::AllOpsResolver *resolver) { + tflite::MicroOpResolver *resolver) { const char *filename = "./libtflite_micro_custom.so"; void *custom_lib = dlopen(filename, RTLD_NOW); if (custom_lib) { - TfLiteStatus (*reg_fun)(tflite::AllOpsResolver * res); + TfLiteStatus (*reg_fun)(tflite::MicroOpResolver * res); // see "man dlopen" for an explanation of this nasty construct *(void **)(®_fun) = dlsym(custom_lib, "register_custom"); char *error = dlerror(); @@ -53,9 +52,13 @@ void tflmc::UnloadCustom(tflmc::custom_operator_handle custom_lib) { } #else -// anyone interested in implementing this for Windows (LoadLibrary+GetProcAddr) +// Obviously, no chance of loading shared lib on semi-hosted embedded builds +// of pre-interpeter. +// TODO: could it work on user-space hosted execution on qemu? Attractive option... +// as stuff like command-line args ought to work correctly. +// TODO: anyone interested in implementing this for Windows (LoadLibrary+GetProcAddr) tflmc::custom_operator_handle tflmc::LoadCustom( - tflite::AllOpsResolver *resolver) { + tflite::MicroOpResolver *resolver) { return nullptr; } diff --git a/src/CustomOperators.h b/src/CustomOperators.h index 3d7f60e..ff5383b 100644 --- a/src/CustomOperators.h +++ b/src/CustomOperators.h @@ -21,7 +21,7 @@ limitations under the License. namespace tflmc { typedef void* custom_operator_handle; -custom_operator_handle LoadCustom(tflite::AllOpsResolver* res); +custom_operator_handle LoadCustom(tflite::MicroOpResolver* res); void UnloadCustom(custom_operator_handle); } // namespace tflmc diff --git a/src/Makefile.inc b/src/Makefile.inc index eb5d7f3..48d1730 100644 --- a/src/Makefile.inc +++ b/src/Makefile.inc @@ -1,12 +1,11 @@ -$(info Adding tflite-micro compiler) + TFLITE_U_COMPILER_SRCS := \ src/CodeWriter.cc src/CustomOperators.cc src/MemMap.cc src/TypeToString.cc \ - src/Compiler.cc src/main.cc src/RecordAllocations.cc + src/Compiler.cc src/main.cc src/RecordAllocations.cc src/BuiltinAllocations.cc TFLITE_U_COMPILER_HDRS := \ - src/CodeWriter.h src/Compiler.h src/CustomOperators.h src/MemMap.h src/RecordAllocations.h src/TypeToString.h - - + src/CodeWriter.h src/Compiler.h src/CustomOperators.h src/MemMap.h src/RecordAllocations.h \ + src/TypeToString.h src/BuiltinAllocations.h src/ModelInfo.h # Builds a standalone binary. 
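The "nasty construct" referenced in the comment is the POSIX-recommended way of turning the void* returned by dlsym() into a function pointer without a direct (formally undefined) cast. The same pattern in a self-contained form, against a purely hypothetical libexample.so exporting example_entry:

#include <dlfcn.h>   // POSIX only; link with -ldl
#include <cstdio>

int main() {
  void *lib = dlopen("./libexample.so", RTLD_NOW);       // hypothetical library
  if (!lib) {
    std::fprintf(stderr, "dlopen: %s\n", dlerror());
    return 1;
  }
  int (*entry)(int) = nullptr;
  // Copy the symbol address into the function-pointer object byte for byte.
  *(void **)(&entry) = dlsym(lib, "example_entry");      // hypothetical symbol
  if (const char *err = dlerror()) {
    std::fprintf(stderr, "dlsym: %s\n", err);
  } else {
    std::printf("example_entry(2) = %d\n", entry(2));
  }
  dlclose(lib);
  return 0;
}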
diff --git a/src/MemMap.cc b/src/MemMap.cc index 6a08945..d8eaab8 100644 --- a/src/MemMap.cc +++ b/src/MemMap.cc @@ -1,24 +1,163 @@ #include "MemMap.h" +#include "Options.h" +#include +#include +#include +#include +#include -void tflmc::MemMap::recordROM(ptrdiff_t offset, size_t len, + +tflmc::SufficientArena::SufficientArena( size_t sufficient_size, size_t sufficient_alignment) +{ + size_t padded_size = sufficient_size + 2*sufficient_alignment; + arena_buf.resize(padded_size); + void *arena_start = arena_buf.data(); + aligned_start_ = + static_cast( + std::align(sufficient_alignment, sufficient_alignment, arena_start, padded_size)); + assert( aligned_start_!=nullptr && "Arena alignment failed"); +} + +tflmc::MemMap::MemMap() + : m_total(0) +{ +} + + + +void tflmc::MemMap::record(size_t len, const std::string &tag) { - m_romEntries.push_back({offset, len, tag}); + if (len > 0) { + m_entries.push_back({m_total, len, tag}); + m_total += len; + } +} + +tflmc::ArenaMemMap::ArenaMemMap() +{ +} + + +void tflmc::ArenaMemMap::init(size_t model_op_count) { + m_node_scratchbuf_counts.resize(model_op_count,0); } -void tflmc::MemMap::recordRAM(ptrdiff_t offset, size_t len, +void tflmc::ArenaMemMap::recordPersistent(ptrdiff_t offset, size_t len, const std::string &tag) { - m_ramEntries.push_back({offset, len, tag}); + m_entries.push_back({offset, len, tag}); + updateUsedList(offset, len); +} + +void tflmc::ArenaMemMap::recordScratchBuf(int idx, + ptrdiff_t offset, size_t len, + size_t allocating_node, + const std::string &tag) { + m_scratchbuf_map[idx] = m_entries.size(); + recordPersistent(offset, len, tag); + m_node_scratchbuf_counts[allocating_node] += 1; +} + +std::vector tflmc::ArenaMemMap::scratchBufOffsets() { + std::vector res; + for( auto &sb : m_scratchbuf_map ) + { + assert(sb.first >= 0); + size_t req_sb_table_size = sb.first+1; + res.resize(std::max(req_sb_table_size,res.size())); + res[sb.first] = m_entries[sb.second].base; + } + + return res; +} + +void tflmc::ArenaMemMap::updateUsedList(ptrdiff_t used_begin, size_t used_len) { + + ptrdiff_t used_end = used_begin + used_len; + std::vector to_delete; + auto overlapped_i = m_usedList.lower_bound(used_begin); + // Fuse used block overlapping on left + if (overlapped_i != m_usedList.begin()) + { + --overlapped_i; + if (overlapped_i->second >= used_begin) { + used_begin = overlapped_i->first; + } else { + ++overlapped_i; + } + } + + + auto end_i = m_usedList.upper_bound(used_end); + + // Fuse used blocks overlapped completely or on + // right. + while(overlapped_i != end_i) { + // Invariant: overlapped_i->first >= used_begin + // Invariant: overlapped_i->first < used_end + to_delete.push_back(overlapped_i->first); + used_end = std::max(overlapped_i->second, used_end); + ++overlapped_i; + } + + // Fuse ... + for (auto k : to_delete) { + m_usedList.erase(k); + } + m_usedList[used_begin] = used_end; +} + +void tflmc::ArenaMemMap::stripLargestGap(size_t alignment_to_maintain) { + + // Find largest gap between used blocks. + auto used_i = m_usedList.begin(); + ptrdiff_t prev_end = 0; + ptrdiff_t max_gap_size = 0; + ptrdiff_t max_gap_begin = 0; + while( used_i != m_usedList.end() ) { + auto cur_begin = used_i->first; + auto cur_gap_size = cur_begin-prev_end; + if (cur_gap_size > max_gap_size) { + max_gap_size = cur_gap_size; + max_gap_begin = prev_end; + } + prev_end = used_i->second; + ++used_i; + } + + // Adjust RAM entries to strip out largest gap... 
we get a little sneaky to maintain + // alginment by adjusting the gap_size so sufficient alignment is maintained. + max_gap_size = max_gap_size / alignment_to_maintain * alignment_to_maintain; + for (auto &entry : m_entries ) + { + if (entry.base > max_gap_begin) { + assert( entry.base >= max_gap_begin+max_gap_size); + entry.base -= max_gap_size; + } + } + } +size_t tflmc::ArenaMemMap::size() const { + + ptrdiff_t max_end = 0; + for (auto &entry : m_entries ) + { + max_end = std::max(static_cast(entry.base+entry.len), max_end); + } + return max_end; +} + + + static void PrintBar(const std::string &label, float start, float end) { static const int BAR_WIDTH = 100; static const int TEXT_LABEL_START = 3; if (start == -1.0f) { for (int i = 0; i < BAR_WIDTH + 2; i++) { - printf("#"); + std::cout << '#'; } - printf("\n"); + std::cout << std::endl; return; } @@ -44,42 +183,33 @@ static void PrintBar(const std::string &label, float start, float end) { } } - printf("#"); + std::cout << '#'; for (int i = 0; i < BAR_WIDTH; i++) { if (i >= labelStart && i < labelEnd) { - printf("%c", label[i - labelStart]); + std::cout << label[i - labelStart]; } else if (i >= barStart && i < barEnd) { - printf(smallBar ? "|" : "X"); + std::cout << (smallBar ? "|" : "X"); } else { - printf("."); + std::cout << '.'; } } - printf("#\n"); + std::cout << '#' << std::endl; } -void tflmc::MemMap::report() const { - size_t constSize = 0; - size_t arenaSize = 0; - for (const auto &entry : m_romEntries) { - constSize = std::max(constSize, entry.base + entry.len); - } - for (const auto &entry : m_ramEntries) { - arenaSize = std::max(arenaSize, entry.base + entry.len); - } - printf("ROM summary: %lu bytes total\n", constSize); - PrintBar("", -1.0f, -1.0f); - for (const auto &entry : m_romEntries) { - PrintBar(entry.tag, entry.base / (float)constSize, - (entry.base + entry.len) / (float)constSize); - } - PrintBar("", -1.0f, -1.0f); +void tflmc::MemMap::report(const char *label) const { + tflmc::Options &options = tflmc::Options::instance(); + size_t usage = size(); - printf("RAM summary: %lu bytes total\n", arenaSize); - PrintBar("", -1.0f, -1.0f); - for (const auto &entry : m_ramEntries) { - PrintBar(entry.tag, entry.base / (float)arenaSize, - (entry.base + entry.len) / (float)arenaSize); + + std::cout << label << " summary: " < #include +#include #include namespace tflmc { + + +struct SufficientArena +{ +public: + SufficientArena( size_t sufficient_size, size_t sufficient_alignment); + + uint8_t *alginedBufferStart() { return aligned_start_; } +protected: + std::vector arena_buf; + uint8_t *aligned_start_; + +}; + + // Keeps track of buffers and prints a summary. class MemMap { public: - void recordROM(ptrdiff_t offset, size_t len, const std::string &tag); - void recordRAM(ptrdiff_t offset, size_t len, const std::string &tag); - void report() const; - private: + MemMap(); + + /** + * @brief Record allocated memory section + * + * Primarily these will be data from constant tensors or constant tensor meta-data + * Since ROM cannot be re-used assumged location is simply counted internally + * hence no offset parameter. 
+ * + * @param len Allocated size (may no account for alignment padding) + * @param tag identifying tag for diagnostic/analytic output + */ + void record(size_t len, const std::string &tag); + + void report(const char *label) const; + + + virtual size_t size() const { return m_total; } + + protected: + struct Entry { ptrdiff_t base; size_t len; std::string tag; }; - std::vector m_romEntries; - std::vector m_ramEntries; + + std::vector m_entries; + + ptrdiff_t m_total; +}; + + +// Keeps track of buffers and prints a summary. +class ArenaMemMap : public MemMap +{ + public: + + ArenaMemMap(); + + /** + * @brief Initialize per-op tables (scratch buffer offset etc) + * + * This can't be done at construction time as number of ops is not + * available to pre-intreter "Compiler" sub-object construction time + * (tflite interpreter has yet to be created). + */ + + void init(size_t model_op_count); + + /** + * @brief Record persistent tensor arena-allocatio). + * + * Primarily these will be persistent data buffers for intermediate + * tensor values. Due to differing lifetimes it is quite legal/normal + * for these to overlap. + * @param offset Starting offset in tensor arena + * @param len Length in bytes + * @param tag identifying tag for diagnoistic/analytic output. + */ + void recordPersistent(ptrdiff_t offset, size_t len, const std::string &tag); + + /** + * @brief Record scatch tensor area-allocatino) + * + * Scratch buffers (buffers allocated only for the duration of a single operator + * evaluation) are handled seperately from longer-lived tensor arena allocations + * (intermediate-value tensors and persistent buffers). Presumably this + * to minimize number items processed by the full (expensive) memory allocation algorithm. + * + * + * @param idx Scratch buffer index (handle) + * @param offset Starting offset in tensor arena + * @param len Buffer length in bytes + * @param tag identifying tag for diagnoistic/analytic output. + */ + + void recordScratchBuf(int idx, ptrdiff_t offset, size_t len, size_t allocating_node, const std::string &tag); + + std::vector scratchBufOffsets(); + + typedef std::vector scratchbuf_counts_map_t; + inline const scratchbuf_counts_map_t &nodesScratchBufferAllocationCounts() { + return m_node_scratchbuf_counts; + } + + void stripLargestGap(size_t alginment_to_maintain); + virtual size_t size() const; + + private: + + void updateUsedList(ptrdiff_t used_base, size_t used_len); + + // [begin,end) of unused memory sections + typedef std::map occupancy_map_t; + occupancy_map_t m_usedList; + + // Table of RAM allocations associated with scratchbufs. + typedef std::map scratchbuf_map_t; + scratchbuf_map_t m_scratchbuf_map; + + // Table of number of scratch buffers assigned by each node. + // This is needed to correctly assign scratch buffer indexes in the + // prepare phase for nodes that do use statically code-generated user_data OpData. 
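updateUsedList() keeps the occupancy map as disjoint [begin,end) intervals keyed by their begin offset and fuses every block a new allocation overlaps or touches. The merge step can be written compactly as a free function over the same map type (a sketch, not the member itself):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <iterator>
#include <map>

using UsedMap = std::map<std::ptrdiff_t, std::ptrdiff_t>;  // begin -> end

void MarkUsed(UsedMap &used, std::ptrdiff_t begin, std::ptrdiff_t end) {
  auto it = used.lower_bound(begin);
  if (it != used.begin()) {            // block just left of us may overlap/touch
    auto prev = std::prev(it);
    if (prev->second >= begin) {
      begin = prev->first;
      end = std::max(end, prev->second);
      it = prev;
    }
  }
  while (it != used.end() && it->first <= end) {   // absorb everything we cover
    end = std::max(end, it->second);
    it = used.erase(it);
  }
  used[begin] = end;
}

int main() {
  UsedMap used;
  MarkUsed(used, 0, 64);
  MarkUsed(used, 128, 192);
  MarkUsed(used, 32, 160);             // bridges the gap -> one interval [0,192)
  for (const auto &e : used) std::printf("[%td, %td)\n", e.first, e.second);
  return 0;
}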
+ + scratchbuf_counts_map_t m_node_scratchbuf_counts; }; } // namespace tflmc diff --git a/src/ModelInfo.h b/src/ModelInfo.h new file mode 100644 index 0000000..ef7e546 --- /dev/null +++ b/src/ModelInfo.h @@ -0,0 +1,52 @@ +#ifndef TFLMCOMPILER_MODELINFO_H +#define TFLMCOMPILER_MODELINFO_H + +#include "tensorflow/lite/micro/micro_interpreter.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflmc { + +struct TensorInfo { + TensorInfo(const TfLiteTensor *tensor_ptr) : + tensor(tensor_ptr) + {} + const TfLiteTensor *tensor = nullptr; +}; +struct RegistrationInfo { + const TfLiteRegistration *reg = nullptr; + tflite::BuiltinOperator code; + std::string custom_name; + bool operator==(const RegistrationInfo &other) { + if (code != other.code) return false; + if (code == tflite::BuiltinOperator_CUSTOM) { + return custom_name == other.custom_name; + } else + return true; + } +}; +struct NodeInfo { + NodeInfo() {} + NodeInfo(TfLiteNode tfl_node, ptrdiff_t reg_index) : + node(tfl_node), + regIndex(reg_index) + {} + TfLiteNode node; + ptrdiff_t regIndex = -1; +}; +template +struct Option { + bool None = true; + T Some = T(); + void operator=(T const &val) { + None = false; + Some = val; + } + void clear() { + Some = T(); + None = true; + } +}; + +} // namespace tflmc + +#endif // TFLMCOMPILER_MODELINFO_H diff --git a/src/Options.h b/src/Options.h new file mode 100644 index 0000000..e3a4b02 --- /dev/null +++ b/src/Options.h @@ -0,0 +1,23 @@ +#ifndef TFLMCOMPILER_OPTIONS_H +#define TFLMCOMPILER_OPTIONS_H + +#include + +namespace tflmc { +class Options +{ +private: + Options() {} +public: + bool verbose = false; + std::string memmap_json; + + static Options &instance() { + static Options options; + return options; + } +}; + +} + +#endif // TFLMCOMPILER_OPTIONS_H diff --git a/src/RecordAllocations.cc b/src/RecordAllocations.cc index dc7b3ce..e49f4c8 100644 --- a/src/RecordAllocations.cc +++ b/src/RecordAllocations.cc @@ -1,47 +1,176 @@ #include +#include +#include + +#if !TFLMC_USE_INTERPRETER_HOOKS #define private public +#endif #include "tensorflow/lite/micro/micro_interpreter.h" +#if !TFLMC_USE_INTERPRETER_HOOKS #undef private +#endif #include "CustomOperators.h" #include "RecordAllocations.h" +#include "MemMap.h" #include "tensorflow/lite/micro/all_ops_resolver.h" #include "tensorflow/lite/micro/micro_error_reporter.h" static std::vector g_loggedAllocations; -static tflite::MicroAllocator *g_allocator; -static int g_currentNodeIndex = -1; +static std::vector g_nodeScratchBufferAllocations; +static size_t g_currentNodeIndex = -1; static uint8_t *g_arenaPtr = nullptr; static ptrdiff_t g_arena_size = 0; -static void* LoggingAllocatePersistentBuffer(struct TfLiteContext *ctx, + +struct ScratchBufferInfo { + size_t node_id; + size_t bytes; +}; + +static std::map g_logged_scratch_buffers; + + + +#if TFLMC_USE_INTERPRETER_HOOKS + + +static tflite::MicroInterpreter::TfLiteContextHooks *g_tflm_hooks; + + +static void *LoggingAllocatePersistentBuffer(struct TfLiteContext *ctx, size_t bytes) { + auto ptr = g_tflm_hooks->AllocatePersistentBuffer(ctx, bytes); + assert(ptr != nullptr && "Alloc failure"); + ptrdiff_t offset = (uint8_t *)ptr - g_arenaPtr; + + g_loggedAllocations.push_back( + {offset, bytes, + g_currentNodeIndex, tflmc::AllocKind::Persistent, 0}); + return ptr; +} + + +static TfLiteStatus LoggingRequestScratchBufferInArena(TfLiteContext *ctx, + size_t bytes, + int *buffer_idx) { + + auto res = g_tflm_hooks->RequestScratchBufferInArena(ctx, bytes, buffer_idx); + if (res == 
kTfLiteOk) { + g_logged_scratch_buffers[*buffer_idx] = {g_currentNodeIndex, bytes}; + } + return res; +} + + + +static void* LoggingGetScratchBuffer(struct TfLiteContext* ctx, int buffer_idx) { + return g_tflm_hooks->GetScratchBuffer (ctx, buffer_idx); +} + +static void LoggingNotifyNodeIndex(const struct TfLiteContext* context, + size_t node) { + g_currentNodeIndex = node; + return g_tflm_hooks->NotifyNodeIndex(context, node); +} + +static tflite::MicroInterpreter::TfLiteContextHooks g_recording_hooks = +{ + LoggingAllocatePersistentBuffer, + LoggingRequestScratchBufferInArena, + LoggingGetScratchBuffer, + LoggingNotifyNodeIndex + +}; + +void tflmc::SetRecordAllocationhooks( tflite::MicroInterpreter *interpreter, + uint8_t *arena_start, + size_t arena_size) { + g_tflm_hooks = interpreter->getHooks(); + g_arenaPtr = arena_start; + g_arena_size = arena_size; + interpreter->setHooks(&g_recording_hooks); +} + +void tflmc::RecordScratchBufferAllocations(tflite::MicroInterpreter *interpreter) +{ + auto ctx = interpreter->getTFLContext(); + for( auto &sb_i : g_logged_scratch_buffers ) + { + auto sb_idx = sb_i.first; + void *sb_start = g_tflm_hooks->GetScratchBuffer(ctx, sb_idx ); + assert(sb_start != nullptr && "Unknown Scratch Buffer"); + ptrdiff_t offset = (uint8_t *)sb_start - g_arenaPtr; + g_loggedAllocations.push_back( + {offset, sb_i.second.bytes, + sb_i.second.node_id, tflmc::AllocKind::Scratch, sb_i.first}); + + size_t node_id = sb_i.second.node_id; + if (g_nodeScratchBufferAllocations.size() <= node_id) { + g_nodeScratchBufferAllocations.resize(node_id+1, 0); + } + g_nodeScratchBufferAllocations[node_id] += 1; + } + +} + + + +TfLiteEvalTensor *tflmc::GetEvalTensor(tflite::MicroInterpreter *interpreter, int i) { + auto ctx = interpreter->getTFLContext(); + return ctx->GetEvalTensor(ctx, i); +} + +TfLiteTensor *tflmc::GetTensor(tflite::MicroInterpreter *interpreter, int i) { + auto ctx = interpreter->getTFLContext(); + return ctx->GetTensor(ctx, i); +} + +#else + +static tflite::MicroAllocator *g_allocator; +static void *LoggingAllocatePersistentBuffer(struct TfLiteContext *ctx, size_t bytes) { - void* ptr = g_allocator->AllocatePersistentBuffer(bytes); - assert(ptr!=nullptr && "Alloc failure"); + auto ptr = g_allocator->AllocatePersistentBuffer(bytes); + assert(ptr != nullptr && "Alloc failure"); + ptrdiff_t offset = (uint8_t *)ptr - g_arenaPtr; + g_loggedAllocations.push_back( - {-(g_arenaPtr - (uint8_t *)ptr + g_arena_size), bytes, - g_currentNodeIndex}); + {offset, bytes, + g_currentNodeIndex, tflmc::AllocKind::Persistent, -1}); return ptr; } + static TfLiteStatus LoggingRequestScratchBufferInArena(TfLiteContext *ctx, size_t bytes, int *buffer_idx) { assert(false && "Not handling scratch buffers currently"); - return g_allocator->RequestScratchBufferInArena(g_currentNodeIndex, bytes, + return g_allocator->RequestScratchBufferInArena(bytes, buffer_idx); } -std::vector tflmc::RecordAllocations( - const tflite::Model *model, ptrdiff_t arena_size) { + + // HACK: here in essence, we create a duplicate interpreter here and re-execute + // Fragmnents of MicroInterpreter::AllocateTensors() with instrumented context + // API calls. 
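The hooks build therefore never has to reach into interpreter internals: it saves the original TfLiteContextHooks table, installs thin wrappers that record every call, and forwards to the saved entries. Reduced to its essentials (with a made-up one-entry hook table):

#include <cstddef>
#include <cstdio>
#include <vector>

struct Hooks {                                   // stand-in for the hook table
  void *(*alloc)(std::size_t bytes);
};

static void *RealAlloc(std::size_t bytes) { return ::operator new(bytes); }
static Hooks g_real_hooks = {RealAlloc};

static Hooks *g_forward = nullptr;               // saved original table
static std::vector<std::size_t> g_logged;

static void *LoggingAlloc(std::size_t bytes) {
  g_logged.push_back(bytes);                     // record, then forward
  return g_forward->alloc(bytes);
}
static Hooks g_logging_hooks = {LoggingAlloc};

int main() {
  Hooks *active = &g_real_hooks;
  g_forward = active;                            // remember where to forward
  active = &g_logging_hooks;                     // swap in the recording table

  void *p = active->alloc(64);
  std::printf("logged %zu call(s), first request %zu bytes\n",
              g_logged.size(), g_logged[0]);
  ::operator delete(p);
  return 0;
}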
+ +void tflmc::RecordAllocations( + const tflite::Model *model, + size_t arena_size, size_t arena_alignment) { + + tflmc::SufficientArena arena(arena_size, arena_alignment); + g_arenaPtr = arena.alginedBufferStart(); g_arena_size = arena_size; - std::vector arena_buf(g_arena_size); - g_arenaPtr = arena_buf.data(); tflite::MicroErrorReporter error_reporter; + + // Resolver must be passed in as otherwise pointers to its internal table + // in the arena will be invalidated.... + + tflite::AllOpsResolver resolver; tflmc::custom_operator_handle custom = tflmc::LoadCustom(&resolver); - tflite::MicroInterpreter interpreter(model, resolver, arena_buf.data(), + tflite::MicroInterpreter interpreter(model, resolver, g_arenaPtr, g_arena_size, &error_reporter); auto ctx = &interpreter.context_; @@ -49,14 +178,18 @@ std::vector tflmc::RecordAllocations( tflite::NodeAndRegistration *nodeAndRegs; TfLiteEvalTensor *eval_tensors=nullptr; + tflite::ScratchBufferHandle* scratchhandle=nullptr; + allocator->StartModelAllocation(model, resolver, &nodeAndRegs, &eval_tensors); - allocator->FinishModelAllocation(model, eval_tensors); + allocator->FinishModelAllocation(model, eval_tensors, &scratchhandle); g_allocator = allocator; ctx->AllocatePersistentBuffer = &LoggingAllocatePersistentBuffer; ctx->RequestScratchBufferInArena = nullptr; + auto ctx_GetScratchBuffer = ctx->GetScratchBuffer; ctx->GetScratchBuffer = nullptr; + auto subgraph = model->subgraphs()->Get(0); for (size_t i = 0; i < subgraph->operators()->size(); i++) { auto node = &nodeAndRegs[i].node; @@ -67,6 +200,7 @@ std::vector tflmc::RecordAllocations( } } + ctx->RequestScratchBufferInArena = &LoggingRequestScratchBufferInArena; for (size_t i = 0; i < subgraph->operators()->size(); i++) { @@ -76,11 +210,26 @@ std::vector tflmc::RecordAllocations( g_currentNodeIndex = i; reg->prepare(ctx, node); } + allocator->ResetTempAllocations(); } + + allocator->FinishModelAllocation(model, eval_tensors); + tflmc::UnloadCustom(custom); - return g_loggedAllocations; + for( auto &sb_i : g_logged_scratch_buffers ) + { + auto sb_idx = sb_i.first; + void *sb_start = ctx_GetScratchBuffer( ctx, sb_idx ); + assert(sb_start != nullptr && "Unknown Scratch Buffer"); + ptrdiff_t offset = (uint8_t *)sb_start - g_arenaPtr; + g_loggedAllocations.push_back( + {offset, sb_i.second.bytes, + sb_i.second.node_id, tflmc::AllocKind::Scratch, -1}); + } + } + TfLiteEvalTensor *tflmc::GetEvalTensor(tflite::MicroInterpreter *interpreter, int i) { auto ctx = &interpreter->context_; return ctx->GetEvalTensor(ctx, i); @@ -90,3 +239,9 @@ TfLiteTensor *tflmc::GetTensor(tflite::MicroInterpreter *interpreter, int i) { auto ctx = &interpreter->context_; return ctx->GetTensor(ctx, i); } + +#endif + +const std::vector &tflmc::RecordedAllocations() { + return g_loggedAllocations; +} diff --git a/src/RecordAllocations.h b/src/RecordAllocations.h index a8848ac..ece4ec2 100644 --- a/src/RecordAllocations.h +++ b/src/RecordAllocations.h @@ -6,13 +6,49 @@ namespace tflmc { +enum AllocKind : int { + Persistent, + Scratch +}; + struct Allocation { ptrdiff_t offset; size_t len; - int nodeIndex; + size_t nodeIndex; + AllocKind kind; + int buffer_index; }; -std::vector RecordAllocations(const tflite::Model *model, ptrdiff_t arena_size); + + + // We can try to use a stock kernel but + // this requires us to access private data and re-execute + // Fragmnents of MicroInterpreter::AllocateTensors() with instrumented context + // API calls. Painful to maintain and prone subtle Bugs. 
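With the allocations exposed through RecordedAllocations(), downstream code can derive arena statistics directly from the log, for example the high-water mark per allocation kind. An illustrative consumer, re-declaring the record shapes locally so it stands alone:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

enum AllocKind : int { Persistent, Scratch };

struct Allocation {
  std::ptrdiff_t offset;
  std::size_t len;
  std::size_t nodeIndex;
  AllocKind kind;
  int buffer_index;
};

// Highest end offset reached by any allocation of the given kind.
static std::ptrdiff_t HighWater(const std::vector<Allocation> &log, AllocKind kind) {
  std::ptrdiff_t max_end = 0;
  for (const auto &a : log) {
    if (a.kind == kind) {
      max_end = std::max(max_end, a.offset + static_cast<std::ptrdiff_t>(a.len));
    }
  }
  return max_end;
}

int main() {
  std::vector<Allocation> log = {
      {1024, 256, 0, Persistent, -1},
      {4096, 512, 1, Scratch, 0},
  };
  std::printf("persistent high water: %td, scratch high water: %td\n",
              HighWater(log, Persistent), HighWater(log, Scratch));
  return 0;
}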
Simpler to maintain a patch + // that adds hooks to MicroInterpreter to gather data from an + // actual MicroInterpreter::AllocateTensors() by intercepting the TfliteContext vectors + // which are a reasonably stable API. + +#if TFLMC_USE_INTERPRETER_HOOKS + +void SetRecordAllocationhooks(tflite::MicroInterpreter *interpreter, + uint8_t *arena_start, + size_t arena_size); + +void RecordScratchBufferAllocations(tflite::MicroInterpreter *interpreter); + + +#else +void RecordAllocations( + const tflite::Model *model, size_t arena_size, size_t arena_alignment); +#endif + +/** + * @brief Allocations from tensor arena with type and associated node /scratch buffer index. + * + * @return const std::vector& + */ +const std::vector &RecordedAllocations(); TfLiteEvalTensor *GetEvalTensor(tflite::MicroInterpreter *interpreter, int i); diff --git a/src/TypeToString.cc b/src/TypeToString.cc index 2747904..57a4f8c 100644 --- a/src/TypeToString.cc +++ b/src/TypeToString.cc @@ -67,7 +67,7 @@ std::string tflmc::to_string(TfLiteFusedActivation t) { switch (t) { NAME(kTfLiteActNone); NAME(kTfLiteActRelu); - NAME(kTfLiteActRelu1); + NAME(kTfLiteActReluN1To1); NAME(kTfLiteActRelu6); NAME(kTfLiteActTanh); NAME(kTfLiteActSignBit); diff --git a/src/main.cc b/src/main.cc index 89922f1..418ed7a 100644 --- a/src/main.cc +++ b/src/main.cc @@ -1,22 +1,85 @@ #include "CodeWriter.h" #include "Compiler.h" +#include "Options.h" -int main(int argc, char *argv[]) { - if (argc < 3 || argc > 4) { +#define LOG_ARGC_ARGV 1 + + +/** The "real" main - physical main has workarounds for various + * semi-hosting environments + */ + +int hosted_main(int argc, char *argv[]) { + + tflmc::Options &options = tflmc::Options::instance(); + int cur_arg = 1; + bool usage_error = false; + while (cur_arg < argc && !usage_error) { + + const std::string verbose_flag("--verbose"); + const std::string memory_flag("--mem_summary"); + if (verbose_flag == argv[cur_arg]) { + options.verbose = true; + } else if (memory_flag == argv[cur_arg]) { + if (cur_arg+1 < argc) { + options.memmap_json = argv[cur_arg+1]; + ++cur_arg; + } else { + usage_error = true; + } + } else if (argv[cur_arg][0] == '-') { + // No other "flag" + usage_error = true; + } else { + break; + } + ++cur_arg; + } + if (cur_arg+1 >= argc || cur_arg+4 < argc) { printf( - "Usage: %s modelFile.tflite outFile.cpp [NamingPrefix = \"model_\"]\n", + "Usage: %s [--verbose] [--mem_summary filename.json] modelFile.tflite outputSrcFile outputHdrFile [NamingPrefix (default: \"model_\")]\n", argv[0]); return 1; } std::string prefix = "model_"; - if (argc == 4) { - prefix = argv[3]; + if (cur_arg+3 < argc) { + prefix = argv[cur_arg+3]; } - if (!tflmc::CompileFile(argv[1], argv[2], prefix)) { + if (!tflmc::CompileFile(argv[cur_arg], argv[cur_arg+1], argv[cur_arg+2], prefix)) { return 1; } return 0; } + +#ifdef __ARMCOMPILER_VERSION +extern "C" int arm_sh_parse_cmdline( char ***p_argv); +extern "C" void arm_sh_exit(int code); +#endif + +int main(int argc, char *argv[]) { + +#ifdef __ARMCOMPILER_VERSION + // ARMClang runtime library has a very low (undocumented) maximum command line length + // for its internal argv parsing - exceeding it silently results in empty argv/argc. 
+ argc = arm_sh_parse_cmdline(&argv); +#endif +#if LOG_ARGC_ARGV + printf( "ARGC=%d ", argc); + for(int i=0; i < argc; ++i) { + printf(",%s", argv[i]); + } + printf("\n"); +#endif + int status = hosted_main(argc,argv); +#ifdef __ARMCOMPILER_VERSION + if (status) { + // ARMClang runtime library ignores exit status it always exits + // angel_SWIreason_ReportException with ADP_Stopped_ApplicationExit + arm_sh_exit(status); + } +#endif + return status; +} diff --git a/tflite_u_preint/static_data_utils.cc b/tflite_u_preint/static_data_utils.cc new file mode 100644 index 0000000..fa1d51e --- /dev/null +++ b/tflite_u_preint/static_data_utils.cc @@ -0,0 +1,99 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// PORTABLE OPTIMIZED + +// Support recording of selected kernel variant in prepare phase for static extraction for +// a fixed tflite model. + +// TF_LITE_MICRO_RECORD_OP_USER_DATA: +// When set the names of kernel variants eval functions recorded and can be dumped +// via PointerCollect API. +// TF_LITE_MICRO_USE_OFFLINE_OP_USER_DATA +// When set prepare phase kernel variant selection code is dropped with +// the eval functions recorded in tflite::micro::kernels::conv::eval_functions used instead. +// +// Benefits smaller binary, used unnecessary eval function variants are not lnked. 
+ + + + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" + +#include "tflite_u_preint/static_init_support.h" +#include "tensorflow/lite/micro/kernels/conv.h" + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + +namespace tflite { +namespace micro { + + + +CppPODStructInitializer TfLitePaddingValuesSubStruct(TfLitePaddingValues &pv) { + + auto init = new CppItems(); + *init + << pv.width + << pv.height + << pv.width_offset + << pv.height_offset; + + CppPODStructInitializer res(init); + return res; +} + + + +CppPODStructInitializer TfLiteOpDataConvSubStruct(OpDataConv &odc, size_t output_depth) { + + auto init = new CppItems(); + *init + << TfLitePaddingValuesSubStruct(odc.padding) + << odc.input_zero_point + << odc.filter_zero_point + << odc.output_zero_point + << odc.output_multiplier + << odc.output_shift + << tflite::micro::CppNamedVec("per_channel_output_multiplier", "int32_t", + odc.per_channel_output_multiplier, output_depth) + << tflite::micro::CppNamedVec("per_channel_output_shift", "int32_t", + odc.per_channel_output_shift, output_depth) + << odc.output_activation_min + << odc.output_activation_max; + CppPODStructInitializer res(init); + return res; +} + + +CppNamedStruct TfLiteCustomSub8BitPackingDetailsStructPtr(const char *name, const TfLiteCustomSub8BitPackingDetails &pv) { + + auto init = new CppItems(); + *init + << pv.bits_per_item + << pv.container_bits + << pv.packed_minor_dims + << pv.sparsity_coding + << "{}"; // Empty initializer + CppNamedStruct res(name, "const TfLiteCustomSub8BitPackingDetails", init); + return res; +} + + +} // namespace micro +} // namespace tflite + +#endif // TF_LITE_MICRO_RECORD_OP_USER_DATA diff --git a/tflite_u_preint/static_data_utils.h b/tflite_u_preint/static_data_utils.h new file mode 100644 index 0000000..3d99e81 --- /dev/null +++ b/tflite_u_preint/static_data_utils.h @@ -0,0 +1,41 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TFLMCOMPILER_STATIC_DATA_UTILS_H_ +#define TFLMCOMPILER_STATIC_DATA_UTILS_H_ + +#include "tflite_u_preint/static_init_support.h" + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +namespace tflite { + struct OpDataConv; + +namespace micro { + + + +CppPODStructInitializer TfLitePaddingValuesSubStruct(TfLitePaddingValues &pv); + +CppPODStructInitializer TfLiteOpDataConvSubStruct(OpDataConv &odc, size_t output_depth); + +CppNamedStruct TfLiteCustomSub8BitPackingDetailsStructPtr(const char *name, const TfLiteCustomSub8BitPackingDetails &pv); + + +} // namespace micro +} // namespace tflite + +#endif +#endif // #ifndef TFLMCOMPILER_STATIC_DATA_UTILS_H_ diff --git a/tflite_u_preint/static_init_support.cc b/tflite_u_preint/static_init_support.cc new file mode 100644 index 0000000..a03b2ab --- /dev/null +++ b/tflite_u_preint/static_init_support.cc @@ -0,0 +1,430 @@ +/* + * static_init_support.cc + * + * Created on: 10.08.2020 + * Author: stevensa + */ + +#include "tflite_u_preint/static_init_support.h" + +#if TF_LITE_MICRO_AUTO_DUMPED_OPDATA +#include "tensorflow/lite/micro/kernels/ifx_common/offline_prepare_utils.h" +#endif + +#include +#include +#include +#include +#include + + +namespace tflite { +namespace micro { + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + +// Vector: needs a named sub-initializer that has to be output first +CppItems &CppItems::operator<<(const char *literal) { + elements_.push_back( + std::unique_ptr(new CppLiteral(literal))); + return *this; +} + +CppItems &CppItems::operator<<(float value) { + elements_.push_back(std::unique_ptr( + new CppPrimitiveInitializer(value))); + return *this; +} + +CppItems &CppItems::operator<<(const CppNamedStruct &structref) { + elements_.push_back(std::unique_ptr( + new CppNamedStruct(structref))); + return *this; +} + +CppItems &CppItems::operator<<(const CppPODStructInitializer &substruct) { + elements_.push_back(std::unique_ptr( + new CppPODStructInitializer(substruct))); + return *this; +} + + +// TODO Fold into CppInitializerCollector + +class BaseCollector { + public: + BaseCollector() {} + + void recordLiteralForPointer(void *ptr, const std::string &identifier) { + pointer_literals_[ptr] = identifier; + } + + std::string getLiteralForPointer(void *ptr) { + std::string res; + auto lit_i = pointer_literals_.find(ptr); + if (lit_i != pointer_literals_.end()) { + res = lit_i->second; + } + return res; + } + + protected: + // LUT to find name for pointer (mainly intendded for function pointers) + std::map pointer_literals_; + std::string output_path_; +}; + +// +// singleton owning all all pointer collector implementations +// Used to implement auto-dump on exit without dependency +// on static object destruction ordering. +// + +class CppInitializerCollector : public BaseCollector { +protected: + CppInitializerCollector(); + public: + static CppInitializerCollector &instance(); + + void recordOpDataHeaders(const char *op_name, const char *headers, + const char *type); + + void recordStaticOpdata(const char *op_name, CppItems *op_data); + + void writeStaticOpDataHeaders(std::ostream &os); + + void writeStaticOpDataDefinitions(const std::string &prefix, std::ostream &os); + + size_t constDataSize() const; + + size_t initDataSize() const; + + size_t uninitDataSize() const; + + // Scratch buffer recording suuproted only for unit-testing static op data recording + // auto-dump. 
Post-compiler intercepts all Allocation requests itself + +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES + int recordScratchBuffer(ptrdiff_t offset_from_head); + + ptrdiff_t getRecordedScratchBufferStart(int globally_unique_buf_idx); + + void writeRecordedScratchBufferAllocations(std::ostream &os); + + void codegenRecordedOpdata() { + std::fstream myfile( + "gen/autodumped_src/static_eval_tables.cc", std::fstream::out | std::fstream::trunc); + myfile << "#include \"tensorflow/lite/c/common.h\"\n" + "#include \"tensorflow/lite/c/builtin_op_data.h\"\n" + "#include \"tflite_u_preint/static_init_support.h\"\n" + "\n"; + writeStaticOpDataHeaders(myfile); + myfile << "\n"; + writeStaticOpDataDefinitions("autorecord_", myfile); + myfile << "\n"; + + // Needed for unit-tests as KernelRunner (etc) don't inject recording + // of buffer Allocation + writeRecordedScratchBufferAllocations(myfile); + myfile.close(); + } + + static void autoDumpOpDataTables() { + instance().codegenRecordedOpdata(); + } + + ~CppInitializerCollector() { + } + +#endif + + std::map op_headers_; + + // Map associating operator supporting static initializatino data + // with required headers (identified via node pointer) + // with recorded C++ static initialization data + std::map> + per_inst_user_data_; + + +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES + /** + * @brief Allocated scratch buffer starts in tensor arena + * + */ + std::vector scratch_buf_allocations_; +#endif + /** + * @brief Recorded per-instance op user_data sequence + * + * Per-op user data in order of op invocation (identified by op-type and + * instance in model execution order) + */ + + struct OpInstUserData { + std::string op_name; //!< Op type name + size_t user_data_idx; //!< Instance of op type in model + }; + + std::vector op_user_data_; + +}; + + +CppInitializerCollector::CppInitializerCollector() + { + } + + +CppInitializerCollector &CppInitializerCollector::instance() { + + /* We manually created a object destructed on exit as not all our + embedded/semi-hosted environments seem to support C++ static object + destruction on exit */ + static CppInitializerCollector *inst = nullptr; + if( inst != nullptr) { + return *inst; + } + inst = new CppInitializerCollector; + + // For autodump based testing we generate C++ source with + // the captured op user_data and buffer memory allocations... + // on exit... +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES + atexit(CppInitializerCollector::autoDumpOpDataTables); +#endif + return *inst; +} + +void CppInitializerCollector::recordOpDataHeaders(const char *op_name, + const char *headers, + const char *op_data_type) { + std::string key(op_name); + auto &headers_for_op = op_headers_[key]; + assert(headers_for_op.empty()); + headers_for_op = std::string(headers); + + // Create a Named struct vector record to hold the per-instance op user_data + // for instances of this operator type. 
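instance() leaks the collector and registers the dump via atexit() on purpose: several of the semi-hosted targets never run static destructors, while atexit handlers still fire. The pattern in isolation, with a hypothetical Collector:

#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

class Collector {
 public:
  static Collector &instance() {
    // Leaked on purpose: atexit still fires on the supported targets even when
    // static destructors do not, so the dump happens exactly once at exit.
    static Collector *inst = nullptr;
    if (inst == nullptr) {
      inst = new Collector;
      std::atexit(&Collector::dumpAtExit);
    }
    return *inst;
  }
  void record(const std::string &line) { lines_.push_back(line); }

 private:
  static void dumpAtExit() {
    for (const auto &l : instance().lines_) std::puts(l.c_str());
  }
  std::vector<std::string> lines_;
};

int main() {
  Collector::instance().record("conv2d: OpDataConv #0");
  Collector::instance().record("fully_connected: OpDataFullyConnected #0");
  return 0;  // dumpAtExit() runs after main returns
}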
+  auto &op_user_data = per_inst_user_data_[key];
+  op_user_data.reset(
+      new CppNamedStructVecInitializer("op_user_data", op_data_type));
+}
+
+void CppInitializerCollector::recordStaticOpdata(const char *op_name,
+                                                 CppItems *op_data) {
+  std::string key(op_name);
+  auto &inst_user_data = per_inst_user_data_[key];
+  size_t inst_idx = inst_user_data->getSize();
+  auto pod_init = new CppPODStructInitializer(op_data);
+  inst_user_data->pushBackElt(pod_init);
+
+  // Record a reference to the op data to hand to this op instance during execution
+  OpInstUserData user_data_ref = {op_name, inst_idx};
+  op_user_data_.push_back(user_data_ref);
+}
+
+void CppInitializerCollector::writeStaticOpDataHeaders(std::ostream &os) {
+  for (auto &hdr_i : op_headers_) {
+    os << hdr_i.second;
+    os << "\n";
+  }
+}
+
+void CppInitializerCollector::writeStaticOpDataDefinitions(const std::string &prefix, std::ostream &os) {
+  os << "namespace tflite {\n"
+        "namespace ops {\n"
+        "namespace micro {\n\n";
+  // Op user_data tables (one per op type supporting offline pre-computed user_data)
+  for (auto &id_i : per_inst_user_data_) {
+    os << "namespace " << id_i.first << " {\n\n";
+    id_i.second->cppDefinition(os, prefix);
+    os << "} // namespace " << id_i.first << "\n\n";
+  }
+
+  os << "} // namespace micro\n"
+        "} // namespace ops\n\n"
+
+        "namespace micro {\n"
+        "namespace " << prefix << "model {\n";
+
+  // Table of op user_data in op invocation order
+  os << "void *precomputed_op_user_data[] = {\n";
+  for (auto &ud_ref_i : op_user_data_) {
+    os << "   &tflite::ops::micro::" << ud_ref_i.op_name << "::" << prefix << "op_user_data[" << ud_ref_i.user_data_idx << "],\n";
+  }
+  os << "};\n\n";
+
+  os << "} // namespace " << prefix << "model\n";
+
+  os << "} // namespace micro\n";
+  os << "} // namespace tflite\n";
+}
+
+size_t CppInitializerCollector::initDataSize() const {
+
+  // Currently, due to non-const-cleanness in tflite(u), we generate ALL OpData
+  // as initialized non-const data. Hence it consumes its value size in ROM AND RAM.
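  // Illustrative figures only: three recorded instances whose streamed OpData is
  // 48 bytes each would make the loop below report 3 * 48 = 144 bytes, budgeted
  // once in ROM (the initializer image) and again in RAM (the mutable copy).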
+ size_t usage = 0; + for (auto &id_i : per_inst_user_data_) { + usage += id_i.second->value_size(); + } + return usage; +} + + +size_t CppInitializerCollector::uninitDataSize() const { + return 0; +} + +size_t CppInitializerCollector::constDataSize() const { + return 0; +} + +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES + +int CppInitializerCollector::recordScratchBuffer(ptrdiff_t offset_from_head) { + int globally_unique_buf_idx = static_cast(scratch_buf_allocations_.size()); + scratch_buf_allocations_.push_back(offset_from_head); + return globally_unique_buf_idx; +} + + +ptrdiff_t CppInitializerCollector::getRecordedScratchBufferStart(int globally_unique_buf_idx) { + if (globally_unique_buf_idx < 0 || static_cast(globally_unique_buf_idx) >= scratch_buf_allocations_.size()) { + return static_cast(0xdeadbeef); + } else { + return scratch_buf_allocations_[globally_unique_buf_idx]; + } +} + + +void CppInitializerCollector::writeRecordedScratchBufferAllocations(std::ostream &os) +{ + os << "namespace tflite {\n" + << "namespace micro {\n\n"; + + if (scratch_buf_allocations_.size() == 0) { + os << +R"( +ptrdiff_t getRecordedScratchBufferStart(int buf_idx) { + return 0xdeadbeef; +} +)"; + } else { + os << +R"( +ptrdiff_t scratch_buffer_allocations[] = { +)"; + size_t offsets = 0; + for (auto o : scratch_buf_allocations_) { + os << std::to_string(o) << ","; + ++offsets; + if (offsets % 10 == 0) { + os << "\n"; + } else { + os << " "; + } + } + os << +R"( +}; + +ptrdiff_t getRecordedScratchBufferStart(int globally_unique_buf_idx) { + const int num_sbuf_allocs = static_cast(sizeof(scratch_buffer_allocations) / sizeof(ptrdiff_t)); + if (globally_unique_buf_idx < 0 || globally_unique_buf_idx >= num_sbuf_allocs) { + return 0xdeadbeef; + } else { + return scratch_buffer_allocations[globally_unique_buf_idx]; + } +} + +)"; + } + os << "} // namespace micro\n" + "} // namespace tflite\n"; +} +#endif + +void CppPointerLiteral::cppInitializer(std::ostream &os, + const std::string &id_prefix) { + auto literal = CppInitializerCollector::instance().getLiteralForPointer(ptr_); + assert(!literal.empty()); + os << literal; +} + +// +// Primary entry point for tflite(u) post-compiler... 
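// Illustrative sketch of what writeStaticOpDataDefinitions() above emits for a
// model with two recorded instances of a "conv" op and prefix "autorecord_"
// (the op and user_data type names are assumed for the example):
//
//   namespace tflite { namespace ops { namespace micro {
//   namespace conv {
//   OpDataConv autorecord_op_user_data[] = {
//     { /* fields streamed for instance 0 */ },
//     { /* fields streamed for instance 1 */ },
//   };
//   }  // namespace conv
//   } } }  // namespace micro, ops, tflite
//
//   namespace tflite { namespace micro { namespace autorecord_model {
//   void *precomputed_op_user_data[] = {
//     &tflite::ops::micro::conv::autorecord_op_user_data[0],
//     &tflite::ops::micro::conv::autorecord_op_user_data[1],
//   };
//   } } }  // namespace autorecord_model, micro, tflite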
+// + +void writeStaticOpDataHeaders(std::ostream &os) { + CppInitializerCollector::instance().writeStaticOpDataHeaders(os); +} + +void writeStaticOpDataDefinitions(const std::string &prefix, std::ostream &os) { + CppInitializerCollector::instance().writeStaticOpDataDefinitions(prefix, os); +} + +void recordStaticOpdata(const char *op_name, CppItems *op_data) { + CppInitializerCollector::instance().recordStaticOpdata(op_name, op_data); +} + +void recordLiteralForPointer(const std::string &literal, void *ptr) { + CppInitializerCollector::instance().recordLiteralForPointer(ptr, literal); +} + +size_t initDataUsage() { + return CppInitializerCollector::instance().initDataSize(); +} + +size_t uninitDataUsage() { + return CppInitializerCollector::instance().uninitDataSize(); +} + +size_t constDataUsage() { + return CppInitializerCollector::instance().constDataSize(); +} + + +DefineStaticOpDataHeaders::DefineStaticOpDataHeaders( + const char *op_name, const char *headers, const char *user_data_type) { + CppInitializerCollector::instance().recordOpDataHeaders(op_name, headers, + user_data_type); +} + +#endif + +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES +int recordScratchBuffer(ptrdiff_t offset_from_head) { + return CppInitializerCollector::instance().recordScratchBuffer(offset_from_head); +} +#endif + + +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES +ptrdiff_t getRecordedScratchBufferStart(int globally_unique_buf_idx) { + return CppInitializerCollector::instance().getRecordedScratchBufferStart(globally_unique_buf_idx); +} +#endif + + +#if TF_LITE_MICRO_AUTO_DUMPED_OPDATA + +// Provided by autorecord-ed generated op user_data code.... + +namespace autorecord_model { +extern void *precomputed_op_user_data[]; +} // namespace autorecord_mdoel + +void selectAutoDumpedOfflineOpUserData() { + resetOfflineOpUserData(autorecord_model::precomputed_op_user_data); +} +#endif + + +} // namespace micro +} // namespace tflite diff --git a/tflite_u_preint/static_init_support.h b/tflite_u_preint/static_init_support.h new file mode 100644 index 0000000..cc5a825 --- /dev/null +++ b/tflite_u_preint/static_init_support.h @@ -0,0 +1,528 @@ +/* + * static_init_support.h + * + * Created on: 10.08.2020 + * Author: stevensa + */ + +#ifndef TFLMCOMPILER_STATIC_INIT_SUPPORT_H_ +#define TFLMCOMPILER_STATIC_INIT_SUPPORT_H_ + + + + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/op_macros.h" + +#include +#include +#include +#include +#include +#include +#include + + +namespace tflite { +namespace micro { + +#if TF_LITE_MICRO_RECORD_OP_USER_DATA + +class BaseCollector; + +class CppNamedStruct; +class CppPODStructInitializer; + +struct CppInitializerBase { + virtual void cppInitializer(std::ostream &os, + const std::string &id_prefix) = 0; + virtual void cppDefinition(std::ostream &os, + const std::string &id_prefix) = 0; + + /** + * @brief Memory need to hold value of an item to initialize POD struct member + * + * + * @return size_t + */ + virtual size_t value_size() const { return 0; }; + + /** + * @brief Memory need to reference the value of the item as a struct menber + * + * For items located inline this will be 0, for named itsemf the size of + * a pointer / ref to the item. 
+ * + * @return size_t + */ + virtual size_t ref_size() const { return 0; }; + + /** + * @brief Alignment constraint for value / ref POD struct member initializer + * @return size_t + */ + virtual size_t align() const { return 1; }; + + + /** + * @brief Compute aount of padding need to achieve alignment constraint + * + * @param prev_item_end End of previous item in assumed address space. + * @param required_alignment Required alignment in assumed address space + * @return size_t Padding required to achive alignment + */ + inline static size_t alignment_padding(size_t prev_item_end, size_t required_alignment) { + size_t misalign = prev_item_end%required_alignment; + return misalign != 0 ? required_alignment-misalign : 0; + } + + + virtual ~CppInitializerBase() {} + +}; + +template +class CppPrimitiveInitializer : public CppInitializerBase { + public: + CppPrimitiveInitializer(const T val) : val_(val) {} + + void cppDefinition(std::ostream &os, const std::string &id_prefix) {} + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << std::to_string(val_); + } + + size_t value_size() const { + return sizeof(T); + } + + size_t align() const { + return alignof(T); + } + + protected: + T val_; +}; + +class CppNamedItemBase : virtual public CppInitializerBase { + protected: + CppNamedItemBase() {} + + public: + CppNamedItemBase(const char *id) : id_(id) {} + + const char *getId() const { return id_; } + + protected: + const char *id_; +}; + +class CppInitializerReference : public CppNamedItemBase { + public: + CppInitializerReference(const char *id) : CppNamedItemBase(id) {} + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << id_prefix << id_; + } + + // A little dirty but fortunately exotica with varying pointer sizes + // not our worry... + size_t ref_size() const { return sizeof(int &); } + size_t align() const { return alignof(int &); } +}; + + +class CppInitializerPointer : public CppNamedItemBase { + public: + CppInitializerPointer(const char *id) : CppNamedItemBase(id) {} + + void cppDefinition(std::ostream &os, const std::string &id_prefix) {} + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << "&" << id_prefix << id_; + } + + // A little dirty but fortunately exotica with varying pointer sizes + // not our worry... + size_t ref_size() const { return sizeof(int *); } + size_t align() const { return alignof(int *); } + +}; + + +class CppLiteral : public CppInitializerBase { + public: + CppLiteral(const char *literal) : literal_(literal) {} + + CppLiteral(const std::string &literal) : literal_(literal) {} + + CppLiteral(std::string &&literal) + : literal_(std::forward(literal)) {} + + void cppDefinition(std::ostream &os, const std::string &id_prefix) {} + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << literal_; + } + + size_t value_size() const { return sizeof(int *); } + size_t align() const { return sizeof(int *); } + + protected: + std::string literal_; +}; + + +class CppPointerLiteral : public CppInitializerBase { + public: + CppPointerLiteral(void *ptr) : ptr_(ptr) {} + + + void cppDefinition(std::ostream &os, const std::string &id_prefix) {} + + void cppInitializer(std::ostream &os, const std::string &id_prefix); + + // A little dirty but fortunately exotica with varying pointer sizes + // not our worry... 
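  // Illustrative sketch (the function name MyOpEval is assumed): a raw pointer
  // can only be serialized if a symbolic literal was registered for it first,
  // e.g.
  //
  //   tflite::micro::recordLiteralForPointer(
  //       "tflite::ops::micro::MyOpEval", reinterpret_cast<void *>(MyOpEval));
  //   ...
  //   *items << reinterpret_cast<void *>(MyOpEval);
  //
  // cppInitializer() then looks the pointer up and prints the registered
  // literal instead of a meaningless numeric address.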
+ size_t value_size() const { return sizeof(void *); } + size_t align() const { return alignof(void *); } + + protected: + void *ptr_; +}; + + +class CppDefinitionBase : public CppNamedItemBase { + public: + CppDefinitionBase(const char *id, const char *type) + : CppNamedItemBase(id), type_(type) {} + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << id_prefix << id_; + } + const char *getType() const { return type_; } + + protected: + const char *type_; +}; + + +template +class CppNamedVec : public CppDefinitionBase { + public: + CppNamedVec(const char *id, const char *type, const T *data, size_t len) + : CppDefinitionBase(id, type) + , null_(data == nullptr) { + if (!null_) { + for (size_t i = 0; i < len; ++i) { + data_.push_back(data[i]); + } + } + } + + void cppDefinition(std::ostream &os, const std::string &id_prefix) { + if (null_) { + os << "constexpr " << type_ << " *" << id_prefix << id_ << " = nullptr;\n"; + } else { + os << type_ << " " << id_prefix << id_ << "[] = {\n"; + for (size_t i = 0; i < data_.size(); ++i) { + os << data_[i] << ", "; + } + os << "\n};\n"; + } + } + + + size_t value_size() const { return sizeof(T)*data_.size(); } + size_t ref_size() const { return sizeof(T *); } + size_t align() const { return alignof(T *); } + protected: + // We have copy data as (de)allocation before serialization is possible + std::vector data_; + bool null_; +}; + + +class CppItems { + public: + CppItems() {} + + template + typename std::enable_if::value, CppItems &>::type + operator<<(T value) { + elements_.push_back(std::unique_ptr( + new CppPrimitiveInitializer(value))); + return *this; + } + + // For pointer to array: needs a named sub-initializer that has to be output first + template + CppItems &operator<<(const CppNamedVec &subvec); + + CppItems &operator<<(const char *literal); + + CppItems &operator<<(float fvalue); + + + template + typename std::enable_if::value, + CppItems &>::type + operator<<(T value); + + // Pointer to structure: needs a named sub-initializer that has to be output first + CppItems &operator<<(const CppNamedStruct &structref); + + // For sub-strucuture: an + CppItems &operator<<(const CppPODStructInitializer &substruct); + + typedef std::deque> named_subinits_t; + typedef std::vector> elements_t; + + + const elements_t &elements() const { return elements_; } + + size_t value_size() const { + size_t init_size = 0; + size_t values_size = 0; + for( auto &e : elements_) { + auto e_align = e->align(); + auto padding = CppInitializerBase::alignment_padding(init_size, e_align); + init_size += padding + e->ref_size(); + values_size += e->value_size(); + } + // TODO: really we should allow for padding between values too! 
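    // Worked example of alignment_padding() above: if the previous item ends at
    // offset 6 and the next element needs 4-byte alignment, misalign = 6 % 4 = 2,
    // so 2 bytes of padding are inserted and the element starts at offset 8; an
    // already aligned end (e.g. 8 with alignment 4) needs no padding.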
+ return init_size+values_size; + } + + size_t align() const { + if (elements_.empty()) { + return 1; + } else { + return elements_[0]->align(); + } + } + +protected: + elements_t elements_; + +}; // namespace micro + + +class CppPODStructInitializer : public CppInitializerBase { + public: + CppPODStructInitializer(CppItems *cppitems) + : cppitems_(cppitems) + { + } + + + void cppDefinition(std::ostream &os, const std::string &id_prefix) { + for (auto &si : cppitems_->elements()) { + si->cppDefinition(os, id_prefix); + } + } + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << "{"; + auto &elts = cppitems_->elements(); + for (size_t i = 0; i < elts.size(); ++i) { + if (i > 0) { + os << ", "; + } + elts[i]->cppInitializer(os, id_prefix); + } + os << "}"; + } + + size_t value_size() const { + return cppitems_->value_size(); + } + + size_t align() const { + return cppitems_->align(); + } + +protected: + + std::shared_ptr cppitems_; + +}; // namespace micro + + /** + * @todo really, this should be named CppPtrToNamedStruct + */ +class CppNamedStruct : public CppDefinitionBase { + public: + CppNamedStruct(const char *id, const char *type, CppItems *cppitems) + : CppDefinitionBase(id, type) + , cppitems_(cppitems) + {} + + + void cppInitializer(std::ostream &os, const std::string &id_prefix) { + os << "&" << id_prefix << id_; + } + + void cppDefinition(std::ostream &os, const std::string &id_prefix) { + std::string sub_prefix = id_prefix + id_ + "_"; + cppitems_.cppDefinition(os, sub_prefix); + os << type_ << " " << id_prefix << id_ << " = \n"; + cppitems_.cppInitializer(os, sub_prefix); + os << ";\n"; + } + + size_t ref_size() const { + return sizeof(int *); + } + + size_t value_size() const { + return cppitems_.value_size(); + } + + size_t align() const { + return alignof(int *); + } + + +protected: + CppPODStructInitializer cppitems_; +}; + + +class CppNamedStructVecInitializer : public CppDefinitionBase { + public: + CppNamedStructVecInitializer(const char *id, const char *type) + : CppDefinitionBase(id, type) {} + + + void cppDefinition(std::ostream &os, const std::string &id_prefix) { + for (size_t i = 0; i < elts_.size(); ++i) { + std::string sub_prefix = id_prefix + id_ + std::to_string(i) + "_"; + elts_[i]->cppDefinition(os, sub_prefix); + } + os << getType() << " " << id_prefix << id_ << "[] = {\n"; + for (size_t i = 0; i < elts_.size(); ++i) { + os << " "; + std::string sub_prefix = id_prefix + id_ + std::to_string(i) + "_"; + elts_[i]->cppInitializer(os, sub_prefix); + if (i < elts_.size()-1) { + os << ", "; + } + os << "\n"; + } + os << "};\n"; + } + + void pushBackElt(CppPODStructInitializer *elt) { + elts_.push_back(std::unique_ptr(elt)); + } + + + size_t getSize() const { return elts_.size(); } + + size_t ref_size() const { + return sizeof(int *); + } + + size_t value_size() const { + if(elts_.empty()) { + return 0; + } else { + auto value_size = elts_[0]->value_size(); + auto alignment = elts_[0]->align(); + auto padding = alignment_padding(value_size, alignment); + auto aligned_size = value_size+padding; + return aligned_size*elts_.size(); + } + } + + size_t align() const { + return alignof(int *); + } + + protected: + std::vector> elts_; + +}; + +// +// Implementation of CppItems stream ops +// + +template +CppItems &CppItems::operator<<(const CppNamedVec &subvec) { + elements_.push_back(std::unique_ptr( + new CppNamedVec (subvec)) + ); + return *this; +} + + +template +typename std::enable_if::value, + CppItems &>::type +CppItems::operator<<(T 
value) { + elements_.push_back(std::unique_ptr( + new CppPointerLiteral(reinterpret_cast(value)))); + return *this; +} + + +// +// Primary entry-points for tflite(u) post-compiler... +// + +void writeStaticOpDataHeaders(std::ostream &os); + +void writeStaticOpDataDefinitions(const std::string &prefix, std::ostream &os); + +void recordStaticOpdata(const char *op_name, CppItems *op_data); + +void recordLiteralForPointer(const std::string &literal, void *ptr); + +size_t initDataUsage(); + +size_t uninitDataUsage(); + +size_t constDataUsage(); + +class DefineStaticOpDataHeaders { + public: + DefineStaticOpDataHeaders(const char *op_name, const char *headers, + const char *user_data_type); +}; +#endif + +#if TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES +/** + * @brief Record new scratch buffers offset in tensor arena + * + * @param offset_from_head Tensor arena offset + * @return int Globally unique index to identify this scratch buffer + */ + +int recordScratchBuffer(ptrdiff_t offset_from_head); +#endif + +#if TF_LITE_MICRO_AUTO_DUMPED_OPDATA || TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES +/** + * @brief Get offset in tensor arena of start of specified (allocated) scratch buffer + * + * @param globally_unique_buf_idx (Globally unique buffer index from @c recordScratchBuffer) + * @return ptrdiff_t Scratch buffer start as offset into tensor arena. + * + */ + +ptrdiff_t getRecordedScratchBufferStart(int globally_unique_buf_idx); +#endif + +#if TF_LITE_MICRO_AUTO_DUMPED_OPDATA + void selectAutoDumpedOfflineOpUserData(); +#endif + +} // namespace micro +} // namespace tflite + + + +#endif /* TFLMCOMPILER_STATIC_INIT_SUPPORT_H_ */ From 18c063a14d903cc824f9ad3a18649f83bada3b4d Mon Sep 17 00:00:00 2001 From: "Philipp v. K" Date: Mon, 13 Jun 2022 11:57:15 +0200 Subject: [PATCH 02/14] Fix cmake standalone build of tflite-micro lib --- CMakeLists.txt | 1 + cmake/FindTFLite.cmake | 87 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b93a33..c5f9021 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ TARGET_LINK_LIBRARIES(${PROJECT_NAME} PUBLIC ${TF_LIB}) TARGET_COMPILE_DEFINITIONS(${PROJECT_NAME} PUBLIC TF_LITE_STATIC_MEMORY + TFLMC_USE_INTERPRETER_HOOKS TF_LITE_DISABLE_X86_NEON SUFFICIENT_ARENA_SIZE=128*1024*1024 ) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index c8389bc..f8fc3d2 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -87,15 +87,31 @@ ENDIF() SET(TFL_SRC ${TF_SRC}/tensorflow/lite) SET(TFLM_SRC ${TFL_SRC}/micro) SET(TFLD_SRC ${TFL_SRC}/tools/make/downloads) +SET(TFLMD_SRC ${TFLM_SRC}/tools/make/downloads) IF(EXISTS ${TFLD_SRC}/flatbuffers/include) LIST(APPEND TFL_INC_DIRS ${TFLD_SRC}/flatbuffers/include) +ELSEIF(EXISTS ${TFLMD_SRC}/flatbuffers/include) + LIST(APPEND TFL_INC_DIRS ${TFLMD_SRC}/flatbuffers/include) ENDIF() IF(EXISTS ${TFLD_SRC}/gemmlowp) - LIST(APPEND ${TFLD_SRC}/gemmlowp) + LIST(APPEND TFL_INC_DIRS ${TFLD_SRC}/gemmlowp) +ELSEIF(EXISTS ${TFLMD_SRC}/gemmlowp) + LIST(APPEND TFL_INC_DIRS ${TFLMD_SRC}/gemmlowp) ENDIF() +IF(EXISTS ${TFLD_SRC}/ruy) + LIST(APPEND TFL_INC_DIRS ${TFLD_SRC}/ruy) +ELSEIF(EXISTS ${TFLMD_SRC}/ruy) + LIST(APPEND TFL_INC_DIRS ${TFLMD_SRC}/ruy) +ENDIF() + +# SET(CUSTOM_QUANT_SRC ${TFL_SRC}/experimental/custom_quantization_util.cc) +# IF(EXISTS ${CUSTOM_QUANT_SRC}) +# SET(TFL_OPT_SRCS ${CUSTOM_QUANT_SRC}) +# ENDIF() + LIST(APPEND TFL_INC_DIRS ${TF_SRC} ) @@ -103,19 +119,55 @@ LIST(APPEND TFL_INC_DIRS FILE(GLOB TFL_ROOT_SRCS 
${TFLM_SRC}/*.cc ) +# schema_utils.cc only exists for newer TF versions +IF(EXISTS ${TFL_SRC}/schema/schema_utils.cc) + LIST(APPEND TFL_ROOT_SRCS ${TFL_SRC}/schema/schema_utils.cc) +ENDIF() FILE(GLOB TFL_KERNELS_SRCS ${TFLM_SRC}/kernels/*.cc ${TFL_SRC}/kernels/internal/quantization_util.cc ${TFL_SRC}/kernels/kernel_util.cc + ${TFLM_SRC}/kernels/kernel_util.cc ) +# These ones carry an unwanted dependecy (TODO: Fix) +FILE(GLOB TFL_KERNELS_TO_REMOVE + # ${TFLM_SRC}/kernels/depth_to_space.cc + # ${TFLM_SRC}/kernels/space_to_depth.cc + # ${TFLM_SRC}/kernels/gather.cc + # ${TFLM_SRC}/kernels/transpose.cc + # ${TFLM_SRC}/kernels/floor_mod.cc + # ${TFLM_SRC}/kernels/floor_div.cc + ) +FOREACH(src ${TFL_KERNELS_TO_REMOVE}) + LIST(FIND TFL_KERNELS_SRCS ${src} TFL_KERNELS_SRCS_FOUND_INDEX) + IF(${TFL_KERNELS_SRCS_FOUND_INDEX} GREATER_EQUAL 0) + LIST(REMOVE_ITEM TFL_KERNELS_SRCS ${src}) + ENDIF() +ENDFOREACH() + +IF(TFLM_EXTRA_KERNELS) + FILE(GLOB TFL_EXTRA_KERNEL_SRCS + ${TFLM_SRC}/kernels/${TFLM_EXTRA_KERNELS}/*.cc + ) + FOREACH(src ${TFL_EXTRA_KERNEL_SRCS}) + GET_FILENAME_COMPONENT(src_name ${src} NAME) + SET(src_path "${TFLM_SRC}/kernels/${src_name}") + LIST(FIND TFL_KERNELS_SRCS ${src_path} TFL_KERNELS_SRCS_FOUND_INDEX) + IF(${TFL_KERNELS_SRCS_FOUND_INDEX} GREATER_EQUAL 0) + MESSAGE(STATUS "Replacing TFLM version of ${src_name} by ${TFLM_EXTRA_KERNELS} variant...") + LIST(REMOVE_ITEM TFL_KERNELS_SRCS ${src_path}) + ENDIF() + ENDFOREACH() +ENDIF() + FILE(GLOB TFL_CORE_API_SRCS ${TFL_SRC}/core/api/*.cc ) FILE(GLOB TFL_C_SRCS - ${TFL_SRC}/c/common.c + ${TFL_SRC}/c/common.cc ) FILE(GLOB TFL_MEM_PLANNER_SRCS @@ -125,30 +177,61 @@ FILE(GLOB TFL_MEM_PLANNER_SRCS SET(TFL_SRCS ${TFL_ROOT_SRCS} ${TFL_KERNELS_SRCS} + ${TFL_EXTRA_KERNEL_SRCS} ${TFL_CORE_API_SRCS} ${TFL_C_SRCS} ${TFL_MEM_PLANNER_SRCS} + ${TFL_OPT_SRCS} ) +MESSAGE(STATUS "TFL_SRCS=${TFL_SRCS}") + LIST(FILTER TFL_SRCS EXCLUDE REGEX "([a-z0-9_]+_test.cc)$") +IF(RECORD_STATIC_KERNELS) + LIST(APPEND TFL_INC_DIRS ${TFLITE_STATIC_INIT_PATH}) + LIST(APPEND TFL_SRCS + ${TFLITE_STATIC_INIT_PATH}/static_data_utils.cc + ${TFLITE_STATIC_INIT_PATH}/static_init_support.cc + ) +ENDIF() + ADD_LIBRARY(${LIB_NAME} STATIC ${TFL_SRCS} ) TARGET_INCLUDE_DIRECTORIES(${LIB_NAME} PUBLIC ${TFL_INC_DIRS} + /home/philipp/src/tflmc/CMSIS_5/ + /home/philipp/src/tflmc/CMSIS_5/CMSIS/Core/Include/ + /home/philipp/src/tflmc/CMSIS_5/CMSIS/NN/Include/ + /home/philipp/src/tflmc/CMSIS_5/CMSIS/DSP/Include/ ) +TARGET_LINK_LIBRARIES(${LIB_NAME} PUBLIC /home/philipp/src/tflmc/CMSIS_5/CMSIS/NN/build/Source/libcmsis-nn.a) + TARGET_COMPILE_DEFINITIONS(${LIB_NAME} PUBLIC TF_LITE_USE_GLOBAL_MAX TF_LITE_USE_GLOBAL_MIN TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS TF_LITE_STATIC_MEMORY TFLITE_EMULATE_FLOAT + CMSIS_NN "$<$:TF_LITE_STRIP_ERROR_STRINGS>" ) +TARGET_COMPILE_DEFINITIONS(${LIB_NAME} PUBLIC + PREINTERPRETER +) + +IF(RECORD_STATIC_KERNELS) + TARGET_COMPILE_DEFINITIONS(${LIB_NAME} PUBLIC + TF_LITE_MICRO_RECORD_STATIC_KERNEL_VARIANT + TF_LITE_MICRO_AUTO_DUMP_POINTER_TABLES + STATIC_INIT_OUT_FILE="${TF_SRC}/tensorflow/lite/micro/kernels/recorded_model/static_eval_tables.cc" + ) +ENDIF() + SET(TFLite_INCLUDE_DIRS ${TFL_INC_DIRS} ) From e3f16d2f1f2712bb04b8c4d0df804c03f8a38e4e Mon Sep 17 00:00:00 2001 From: "Philipp v. 
K" Date: Wed, 22 Jun 2022 17:31:22 +0200 Subject: [PATCH 03/14] CMake: link arena_allocator sources --- cmake/FindTFLite.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index f8fc3d2..5991c4e 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -174,6 +174,10 @@ FILE(GLOB TFL_MEM_PLANNER_SRCS ${TFLM_SRC}/memory_planner/*.cc ) +FILE(GLOB TFL_ARENA_ALLOCATOR_SRCS + ${TFLM_SRC}/arena_allocator/*.cc + ) + SET(TFL_SRCS ${TFL_ROOT_SRCS} ${TFL_KERNELS_SRCS} @@ -181,6 +185,7 @@ SET(TFL_SRCS ${TFL_CORE_API_SRCS} ${TFL_C_SRCS} ${TFL_MEM_PLANNER_SRCS} + ${TFL_ARENA_ALLOCATOR_SRCS} ${TFL_OPT_SRCS} ) From b2d6b36ceab2000f0a1c3e0725a52001ce783482 Mon Sep 17 00:00:00 2001 From: "Philipp v. K" Date: Wed, 22 Jun 2022 17:52:14 +0200 Subject: [PATCH 04/14] CMake: use standalone tflite repository --- cmake/FindTFLite.cmake | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index 5991c4e..09a3079 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -12,7 +12,7 @@ IF(NOT TF_SRC) MESSAGE(STATUS "Getting TF tag '${TF_TAG}' and not master") FetchContent_Declare( tf - GIT_REPOSITORY https://github.com/tensorflow/tensorflow.git + GIT_REPOSITORY https://github.com/tensorflow/tflite-micro.git GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT GIT_TAG ${TF_TAG} @@ -22,7 +22,7 @@ IF(NOT TF_SRC) MESSAGE(STATUS "Getting TF commit '${TF_COMMIT}' and not master") FetchContent_Declare( tf - GIT_REPOSITORY https://github.com/tensorflow/tensorflow.git + GIT_REPOSITORY https://github.com/tensorflow/tflite-micro.git GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT GIT_TAG ${TF_COMMIT} @@ -31,9 +31,10 @@ IF(NOT TF_SRC) ELSE() FetchContent_Declare( tf - GIT_REPOSITORY https://github.com/tensorflow/tensorflow.git + GIT_REPOSITORY https://github.com/tensorflow/tflite-micro.git GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT + GIT_TAG main QUIET ) ENDIF() From ac12e0acc517f9816f28c6e904d3c8181f103b15 Mon Sep 17 00:00:00 2001 From: "Philipp v. K" Date: Wed, 22 Jun 2022 17:57:08 +0200 Subject: [PATCH 05/14] CMake: remove unused code --- cmake/FindTFLite.cmake | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index 09a3079..e9e3fe0 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -132,22 +132,6 @@ FILE(GLOB TFL_KERNELS_SRCS ${TFLM_SRC}/kernels/kernel_util.cc ) -# These ones carry an unwanted dependecy (TODO: Fix) -FILE(GLOB TFL_KERNELS_TO_REMOVE - # ${TFLM_SRC}/kernels/depth_to_space.cc - # ${TFLM_SRC}/kernels/space_to_depth.cc - # ${TFLM_SRC}/kernels/gather.cc - # ${TFLM_SRC}/kernels/transpose.cc - # ${TFLM_SRC}/kernels/floor_mod.cc - # ${TFLM_SRC}/kernels/floor_div.cc - ) -FOREACH(src ${TFL_KERNELS_TO_REMOVE}) - LIST(FIND TFL_KERNELS_SRCS ${src} TFL_KERNELS_SRCS_FOUND_INDEX) - IF(${TFL_KERNELS_SRCS_FOUND_INDEX} GREATER_EQUAL 0) - LIST(REMOVE_ITEM TFL_KERNELS_SRCS ${src}) - ENDIF() -ENDFOREACH() - IF(TFLM_EXTRA_KERNELS) FILE(GLOB TFL_EXTRA_KERNEL_SRCS ${TFLM_SRC}/kernels/${TFLM_EXTRA_KERNELS}/*.cc @@ -190,7 +174,6 @@ SET(TFL_SRCS ${TFL_OPT_SRCS} ) -MESSAGE(STATUS "TFL_SRCS=${TFL_SRCS}") LIST(FILTER TFL_SRCS EXCLUDE REGEX "([a-z0-9_]+_test.cc)$") From 727adae28c59a5eb83fb331414a3b5189160a367 Mon Sep 17 00:00:00 2001 From: "Philipp v. 
K" Date: Wed, 22 Jun 2022 17:53:35 +0200 Subject: [PATCH 06/14] CMake: remove custom quant code --- cmake/FindTFLite.cmake | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index e9e3fe0..7a24eac 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -108,11 +108,6 @@ ELSEIF(EXISTS ${TFLMD_SRC}/ruy) LIST(APPEND TFL_INC_DIRS ${TFLMD_SRC}/ruy) ENDIF() -# SET(CUSTOM_QUANT_SRC ${TFL_SRC}/experimental/custom_quantization_util.cc) -# IF(EXISTS ${CUSTOM_QUANT_SRC}) -# SET(TFL_OPT_SRCS ${CUSTOM_QUANT_SRC}) -# ENDIF() - LIST(APPEND TFL_INC_DIRS ${TF_SRC} ) From a33b847435f7530480916f5ba047494a0ba1df65 Mon Sep 17 00:00:00 2001 From: "Philipp v. K" Date: Wed, 22 Jun 2022 17:57:17 +0200 Subject: [PATCH 07/14] CMake: do not hardcode cmsisnn --- cmake/FindTFLite.cmake | 51 +++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index 7a24eac..c979b00 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -90,6 +90,23 @@ SET(TFLM_SRC ${TFL_SRC}/micro) SET(TFLD_SRC ${TFL_SRC}/tools/make/downloads) SET(TFLMD_SRC ${TFLM_SRC}/tools/make/downloads) + +IF(TFLM_OPTIMIZED_KERNEL) + # Suboptimal but we do not want to hardcode every kernel which should be replaced... + FILE(GLOB TFLM_EXTRA_KERNEL_SRCS ${TFLM_SRC}/kernels/${TFLM_OPTIMIZED_KERNEL}/*.cc) + # LIST(APPEND TFLM_EXTRA_KERNEL_INCS ${TFLM_SRC}/kernels/${TFLM_OPTIMIZED_KERNEL}/) + STRING(TOUPPER "${TFLM_OPTIMIZED_KERNEL}" TFLM_OPTIMIZED_KERNEL_UPPER) +ENDIF() + +IF(TFLM_OPTIMIZED_KERNEL_LIB) + LIST(APPEND TFLM_EXTRA_KERNEL_LIBS ${TFLM_OPTIMIZED_KERNEL_LIB}) +ENDIF() + +IF(TFLM_OPTIMIZED_KERNEL_INCLUDE_DIR) + LIST(APPEND TFLM_EXTRA_KERNEL_INCS ${TFLM_OPTIMIZED_KERNEL_INCLUDE_DIR}) +ENDIF() + + IF(EXISTS ${TFLD_SRC}/flatbuffers/include) LIST(APPEND TFL_INC_DIRS ${TFLD_SRC}/flatbuffers/include) ELSEIF(EXISTS ${TFLMD_SRC}/flatbuffers/include) @@ -127,20 +144,19 @@ FILE(GLOB TFL_KERNELS_SRCS ${TFLM_SRC}/kernels/kernel_util.cc ) -IF(TFLM_EXTRA_KERNELS) - FILE(GLOB TFL_EXTRA_KERNEL_SRCS - ${TFLM_SRC}/kernels/${TFLM_EXTRA_KERNELS}/*.cc - ) - FOREACH(src ${TFL_EXTRA_KERNEL_SRCS}) - GET_FILENAME_COMPONENT(src_name ${src} NAME) +FOREACH(src ${TFLM_EXTRA_KERNEL_SRCS}) + GET_FILENAME_COMPONENT(src_name ${src} NAME) + IF(${src_name} MATCHES ".*_test.*") + LIST(REMOVE_ITEM TFLM_EXTRA_KERNEL_SRCS ${src}) + ELSE() SET(src_path "${TFLM_SRC}/kernels/${src_name}") - LIST(FIND TFL_KERNELS_SRCS ${src_path} TFL_KERNELS_SRCS_FOUND_INDEX) - IF(${TFL_KERNELS_SRCS_FOUND_INDEX} GREATER_EQUAL 0) - MESSAGE(STATUS "Replacing TFLM version of ${src_name} by ${TFLM_EXTRA_KERNELS} variant...") - LIST(REMOVE_ITEM TFL_KERNELS_SRCS ${src_path}) + LIST(FIND TFLM_REFERENCE_KERNEL_SRCS ${src_path} TFLM_KERNEL_SRCS_FOUND_INDEX) + IF(${TFLM_KERNEL_SRCS_FOUND_INDEX} GREATER_EQUAL 0) + MESSAGE(STATUS "Replacing TFLM version of ${src_name} by optimized variant...") + LIST(REMOVE_ITEM TFLM_REFERENCE_KERNEL_SRCS ${src_path}) ENDIF() - ENDFOREACH() -ENDIF() + ENDIF() +ENDFOREACH() FILE(GLOB TFL_CORE_API_SRCS ${TFL_SRC}/core/api/*.cc @@ -186,13 +202,12 @@ ADD_LIBRARY(${LIB_NAME} STATIC TARGET_INCLUDE_DIRECTORIES(${LIB_NAME} PUBLIC ${TFL_INC_DIRS} - /home/philipp/src/tflmc/CMSIS_5/ - /home/philipp/src/tflmc/CMSIS_5/CMSIS/Core/Include/ - /home/philipp/src/tflmc/CMSIS_5/CMSIS/NN/Include/ - /home/philipp/src/tflmc/CMSIS_5/CMSIS/DSP/Include/ + ${TFLM_EXTRA_KERNEL_INCS} ) -TARGET_LINK_LIBRARIES(${LIB_NAME} PUBLIC 
/home/philipp/src/tflmc/CMSIS_5/CMSIS/NN/build/Source/libcmsis-nn.a) +IF(TFLM_EXTRA_KERNEL_LIBS) + TARGET_LINK_LIBRARIES(${LIB_NAME} PUBLIC ${TFLM_EXTRA_KERNEL_LIBS}) +ENDIF() TARGET_COMPILE_DEFINITIONS(${LIB_NAME} PUBLIC TF_LITE_USE_GLOBAL_MAX @@ -200,7 +215,7 @@ TARGET_COMPILE_DEFINITIONS(${LIB_NAME} PUBLIC TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS TF_LITE_STATIC_MEMORY TFLITE_EMULATE_FLOAT - CMSIS_NN + ${TFLM_OPTIMIZED_KERNEL_UPPER} "$<$:TF_LITE_STRIP_ERROR_STRINGS>" ) From 30664f54a470fc593585e5097dfc73c87f4cd2ad Mon Sep 17 00:00:00 2001 From: "Philipp v. K" Date: Wed, 22 Jun 2022 18:00:13 +0200 Subject: [PATCH 08/14] CMake: make tf repo url customizable --- cmake/FindTFLite.cmake | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index c979b00..51b0f19 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -7,12 +7,18 @@ IF(TF_RECACHE) ENDIF() IF(NOT TF_SRC) + + IF(TF_URL) + SET(TF_REPO ${TF_URL}) + ELSE() + SET(TF_REPO https://github.com/tensorflow/tflite-micro.git) + ENDIF() INCLUDE(FetchContent) IF(TF_TAG) MESSAGE(STATUS "Getting TF tag '${TF_TAG}' and not master") FetchContent_Declare( tf - GIT_REPOSITORY https://github.com/tensorflow/tflite-micro.git + GIT_REPOSITORY ${TF_REPO} GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT GIT_TAG ${TF_TAG} @@ -22,7 +28,7 @@ IF(NOT TF_SRC) MESSAGE(STATUS "Getting TF commit '${TF_COMMIT}' and not master") FetchContent_Declare( tf - GIT_REPOSITORY https://github.com/tensorflow/tflite-micro.git + GIT_REPOSITORY ${TF_REPO} GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT GIT_TAG ${TF_COMMIT} @@ -31,7 +37,7 @@ IF(NOT TF_SRC) ELSE() FetchContent_Declare( tf - GIT_REPOSITORY https://github.com/tensorflow/tflite-micro.git + GIT_REPOSITORY ${TF_REPO} GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT GIT_TAG main From 38d0eff11e8e2d426f66ae25d128a3c0f569ff2c Mon Sep 17 00:00:00 2001 From: "Philipp v. 
K" Date: Wed, 29 Jun 2022 10:44:43 +0200 Subject: [PATCH 09/14] CMake: use fixed flatbuffers commit --- cmake/FindTFLite.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index 51b0f19..3751cb4 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -53,8 +53,9 @@ IF(NOT TF_SRC) FetchContent_Declare( flatbuffers - GIT_REPOSITORY https://github.com/google/flatbuffers.git + GIT_REPOSITORY https://github.com/google/flatbuffers.git GIT_PROGRESS FALSE + GIT_TAG f28c2b29364970e23c8ba3d751ca188f8a08c71e QUIET ) FetchContent_GetProperties(flatbuffers) From 69c3f6c7f01682b09688ac2f83778cab479ac233 Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Wed, 29 Jun 2022 12:34:00 +0200 Subject: [PATCH 10/14] Cmake: fixes --- cmake/FindTFLite.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index 3751cb4..4c6e8cd 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -144,13 +144,19 @@ IF(EXISTS ${TFL_SRC}/schema/schema_utils.cc) LIST(APPEND TFL_ROOT_SRCS ${TFL_SRC}/schema/schema_utils.cc) ENDIF() -FILE(GLOB TFL_KERNELS_SRCS +FILE(GLOB TFLM_REFERENCE_KERNEL_SRCS ${TFLM_SRC}/kernels/*.cc ${TFL_SRC}/kernels/internal/quantization_util.cc ${TFL_SRC}/kernels/kernel_util.cc ${TFLM_SRC}/kernels/kernel_util.cc ) + +# Remove broken kernel +IF(EXISTS ${TFLM_SRC}/kernels/unidirectional_sequence_lstm.cc) + LIST(REMOVE_ITEM TFLM_REFERENCE_KERNEL_SRCS ${TFLM_SRC}/kernels/unidirectional_sequence_lstm.cc) +ENDIF() + FOREACH(src ${TFLM_EXTRA_KERNEL_SRCS}) GET_FILENAME_COMPONENT(src_name ${src} NAME) IF(${src_name} MATCHES ".*_test.*") @@ -183,7 +189,7 @@ FILE(GLOB TFL_ARENA_ALLOCATOR_SRCS SET(TFL_SRCS ${TFL_ROOT_SRCS} - ${TFL_KERNELS_SRCS} + ${TFLM_REFERENCE_KERNEL_SRCS} ${TFL_EXTRA_KERNEL_SRCS} ${TFL_CORE_API_SRCS} ${TFL_C_SRCS} From c97a127c3e03bb14698e7eb4619e70fc3df063e6 Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Wed, 17 Aug 2022 12:02:15 +0200 Subject: [PATCH 11/14] CMake: lint --- cmake/FindTFLite.cmake | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cmake/FindTFLite.cmake b/cmake/FindTFLite.cmake index 4c6e8cd..31a345b 100644 --- a/cmake/FindTFLite.cmake +++ b/cmake/FindTFLite.cmake @@ -17,7 +17,7 @@ IF(NOT TF_SRC) IF(TF_TAG) MESSAGE(STATUS "Getting TF tag '${TF_TAG}' and not master") FetchContent_Declare( - tf + tf GIT_REPOSITORY ${TF_REPO} GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT @@ -36,7 +36,7 @@ IF(NOT TF_SRC) ) ELSE() FetchContent_Declare( - tf + tf GIT_REPOSITORY ${TF_REPO} GIT_PROGRESS FALSE GIT_REMOTE_UPDATE_STRATEGY REBASE_CHECKOUT @@ -52,9 +52,9 @@ IF(NOT TF_SRC) SET(TF_SRC ${tf_SOURCE_DIR}) FetchContent_Declare( - flatbuffers + flatbuffers GIT_REPOSITORY https://github.com/google/flatbuffers.git - GIT_PROGRESS FALSE + GIT_PROGRESS FALSE GIT_TAG f28c2b29364970e23c8ba3d751ca188f8a08c71e QUIET ) @@ -66,10 +66,10 @@ IF(NOT TF_SRC) LIST(APPEND TFL_INC_DIRS ${flatbuffers_SOURCE_DIR}/include) FetchContent_Declare( - fixedpoint - GIT_REPOSITORY https://github.com/google/gemmlowp.git - GIT_PROGRESS FALSE - QUIET + fixedpoint + GIT_REPOSITORY https://github.com/google/gemmlowp.git + GIT_PROGRESS FALSE + QUIET ) FetchContent_GetProperties(fixedpoint) IF(NOT fixedpoint_POPULATED) @@ -79,10 +79,10 @@ IF(NOT TF_SRC) LIST(APPEND TFL_INC_DIRS ${fixedpoint_SOURCE_DIR}) FetchContent_Declare( - ruy - GIT_REPOSITORY https://github.com/google/ruy.git 
- GIT_PROGRESS FALSE - QUIET + ruy + GIT_REPOSITORY https://github.com/google/ruy.git + GIT_PROGRESS FALSE + QUIET ) FetchContent_GetProperties(ruy) IF(NOT ruy_POPULATED) @@ -132,12 +132,12 @@ ELSEIF(EXISTS ${TFLMD_SRC}/ruy) LIST(APPEND TFL_INC_DIRS ${TFLMD_SRC}/ruy) ENDIF() -LIST(APPEND TFL_INC_DIRS +LIST(APPEND TFL_INC_DIRS ${TF_SRC} ) FILE(GLOB TFL_ROOT_SRCS - ${TFLM_SRC}/*.cc + ${TFLM_SRC}/*.cc ) # schema_utils.cc only exists for newer TF versions IF(EXISTS ${TFL_SRC}/schema/schema_utils.cc) @@ -145,8 +145,8 @@ IF(EXISTS ${TFL_SRC}/schema/schema_utils.cc) ENDIF() FILE(GLOB TFLM_REFERENCE_KERNEL_SRCS - ${TFLM_SRC}/kernels/*.cc - ${TFL_SRC}/kernels/internal/quantization_util.cc + ${TFLM_SRC}/kernels/*.cc + ${TFL_SRC}/kernels/internal/quantization_util.cc ${TFL_SRC}/kernels/kernel_util.cc ${TFLM_SRC}/kernels/kernel_util.cc ) @@ -172,7 +172,7 @@ FOREACH(src ${TFLM_EXTRA_KERNEL_SRCS}) ENDFOREACH() FILE(GLOB TFL_CORE_API_SRCS - ${TFL_SRC}/core/api/*.cc + ${TFL_SRC}/core/api/*.cc ) FILE(GLOB TFL_C_SRCS @@ -187,10 +187,10 @@ FILE(GLOB TFL_ARENA_ALLOCATOR_SRCS ${TFLM_SRC}/arena_allocator/*.cc ) -SET(TFL_SRCS +SET(TFL_SRCS ${TFL_ROOT_SRCS} ${TFLM_REFERENCE_KERNEL_SRCS} - ${TFL_EXTRA_KERNEL_SRCS} + ${TFLM_EXTRA_KERNEL_SRCS} ${TFL_CORE_API_SRCS} ${TFL_C_SRCS} ${TFL_MEM_PLANNER_SRCS} @@ -244,11 +244,11 @@ IF(RECORD_STATIC_KERNELS) ) ENDIF() -SET(TFLite_INCLUDE_DIRS +SET(TFLite_INCLUDE_DIRS ${TFL_INC_DIRS} ) -SET(TFLite_SOURCES +SET(TFLite_SOURCES ${TFL_SRCS} ) From a8d28531c04377012acd6ce34485a26f0e62bbd1 Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Mon, 27 Jun 2022 14:44:07 +0200 Subject: [PATCH 12/14] include micro_mutable_op_resolver.h to get all required header files for the kernels --- src/Compiler.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Compiler.cc b/src/Compiler.cc index 82a957a..b23d2fc 100644 --- a/src/Compiler.cc +++ b/src/Compiler.cc @@ -1100,6 +1100,7 @@ void tflmc::Compiler::writeSource(std::ostream &out) { #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/kernels/micro_ops.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/micro_context.h" #if LOG_OP_INPUTS From 8bd78e1d5c1e674035828ac3c0968fc3f0a96514 Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Wed, 29 Jun 2022 12:34:22 +0200 Subject: [PATCH 13/14] add missing SUPPORT_CUSTOM_QUANT --- src/Compiler.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Compiler.cc b/src/Compiler.cc index b23d2fc..8e3bf8a 100644 --- a/src/Compiler.cc +++ b/src/Compiler.cc @@ -912,11 +912,15 @@ void tflmc::Compiler::writeInitSource(CodeWriter &wr) { TfLiteAffineQuantization const* quant = ((TfLiteAffineQuantization const*)(tensorData[i].quantization.params)); tflTensors[i].params.scale = quant->scale->data[0]; tflTensors[i].params.zero_point = quant->zero_point->data[0]; - } else if (tflTensors[i].quantization.type == kTfLitePackedAffineQuantization) { +)"; +#if SUPPORT_CUSTOM_QUANT + wr << R"( } else if (tflTensors[i].quantization.type == kTfLitePackedAffineQuantization) { TfLitePackedAffineQuantization const* quant = (TfLitePackedAffineQuantization const*)(tensorData[i].quantization.params); tflTensors[i].params.scale = quant->affine.scale->data[0]; tflTensors[i].params.zero_point = quant->affine.zero_point->data[0]; - } +)"; +#endif // SUPPORT_CUSTOM_QUANT + wr << R"( } )"; } else { wr << " tflTensors[i].quantization.type = 
kTfLiteNoQuantization;\n"; From 5a3c29e2ec8e35d43d1e6d7945ad7b4457abafa4 Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Wed, 29 Jun 2022 12:35:10 +0200 Subject: [PATCH 14/14] turn off LOG_ARGC_ARGV --- src/main.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main.cc b/src/main.cc index 418ed7a..cfa7317 100644 --- a/src/main.cc +++ b/src/main.cc @@ -2,7 +2,9 @@ #include "Compiler.h" #include "Options.h" -#define LOG_ARGC_ARGV 1 +#ifndef LOG_ARGC_ARGV +#define LOG_ARGC_ARGV 0 +#endif // LOG_ARGC_ARGV /** The "real" main - physical main has workarounds for various
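// Usage note (sketch): since LOG_ARGC_ARGV is now wrapped in #ifndef, the
// verbose argc/argv dump can be re-enabled without editing main.cc by defining
// the macro on the compiler command line, e.g. -DLOG_ARGC_ARGV=1.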