Skip to content

Commit

Permalink
next ollama runner (ollama#7913)
Browse files Browse the repository at this point in the history
feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It introduces 3 high level interfaces and a bunch of smaller helper interfaces to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`
- `ml.Tensor` defines the interface for a tensor and tensor operations

This is the first implementation of the new engine. Follow up PRs will implement more features:

- non-greedy sampling (ollama#8410)
- integration with Ollama and KV caching (ollama#8301)
- more model support (ollama#9080) with more coming soon

Co-authored-by: Bruce MacDonald <[email protected]>
  • Loading branch information
mxyng and BruceMacD authored Feb 14, 2025
1 parent 8cf1606 commit 5824541
Show file tree
Hide file tree
Showing 57 changed files with 475,426 additions and 493 deletions.
63 changes: 63 additions & 0 deletions cache/cache.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package cache

import (
"github.com/ollama/ollama/ml"
)

type Options struct {
Position int
}

type Cache interface {
Sub(i int) Cache
Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
}

type Simple struct {
DType ml.DType
Capacity int

keys, values []ml.Tensor
}

func (c *Simple) Sub(i int) Cache {
if i >= len(c.keys) {
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
}

return &Simple{
keys: c.keys[i : i+1],
values: c.values[i : i+1],
Capacity: c.Capacity,
DType: c.DType,
}
}

func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
if c.keys[0] == nil || c.values[0] == nil {
c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
}

ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))

n := min(c.Capacity, int(key.Dim(2))+opts.Position)

key = c.keys[0].View(ctx, 0,
int(key.Dim(0)), int(key.Stride(1)),
int(key.Dim(1)), int(key.Stride(2)),
n,
)

value = c.values[0].View(ctx, 0,
int(value.Dim(0)), int(value.Stride(1)),
int(value.Dim(1)), int(value.Stride(2)),
n,
)

// TODO shift context if necessary

return key, value
}
32 changes: 16 additions & 16 deletions convert/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"log/slog"
"strings"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type ModelParameters struct {
Expand All @@ -27,8 +27,8 @@ type AdapterParameters struct {
} `json:"lora_parameters"`
}

func (ModelParameters) KV(t *Tokenizer) llm.KV {
kv := llm.KV{
func (ModelParameters) KV(t *Tokenizer) ggml.KV {
kv := ggml.KV{
"general.file_type": uint32(1),
"general.quantization_version": uint32(2),
"tokenizer.ggml.pre": t.Pre,
Expand All @@ -54,15 +54,15 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
return kv
}

func (p AdapterParameters) KV() llm.KV {
func (p AdapterParameters) KV() ggml.KV {
var alpha float32
if p.LoraParameters.Alpha == 0 {
alpha = float32(p.Alpha)
} else {
alpha = p.LoraParameters.Alpha
}

kv := llm.KV{
kv := ggml.KV{
"adapter.lora.alpha": alpha,
"adapter.type": "lora",
"general.file_type": uint32(1),
Expand All @@ -79,27 +79,27 @@ func (ModelParameters) specialTokenTypes() []string {
}
}

func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
return ggml.WriteGGUF(ws, kv, ts)
}

func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts)
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
return ggml.WriteGGUF(ws, kv, ts)
}

type ModelConverter interface {
// KV maps parameters to LLM key-values
KV(*Tokenizer) llm.KV
KV(*Tokenizer) ggml.KV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
Tensors([]Tensor) []ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string

// specialTokenTypes returns any special token types the model uses
specialTokenTypes() []string
// writeFile writes the model to the provided io.WriteSeeker
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
}

type moreParser interface {
Expand All @@ -108,17 +108,17 @@ type moreParser interface {

type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(llm.KV) llm.KV
KV(ggml.KV) ggml.KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
Tensors([]Tensor) []ggml.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string

writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
}

func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
Expand Down
10 changes: 5 additions & 5 deletions convert/convert_bert.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"slices"
"strings"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type bertModel struct {
Expand Down Expand Up @@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
return nil
}

func (p *bertModel) KV(t *Tokenizer) llm.KV {
func (p *bertModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false
Expand Down Expand Up @@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
return kv
}

func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
if slices.Contains([]string{
"embeddings.position_ids",
Expand All @@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
continue
}

out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
Expand Down
10 changes: 5 additions & 5 deletions convert/convert_commandr.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package convert
import (
"cmp"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type commandrModel struct {
Expand All @@ -24,7 +24,7 @@ type commandrModel struct {

var _ ModelConverter = (*commandrModel)(nil)

func (p *commandrModel) KV(t *Tokenizer) llm.KV {
func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "command-r"
kv["general.name"] = "command-r"
Expand All @@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) llm.KV {
return kv
}

func (p *commandrModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
Expand Down
10 changes: 5 additions & 5 deletions convert/convert_gemma.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type gemmaModel struct {
Expand All @@ -23,7 +23,7 @@ type gemmaModel struct {

var _ ModelConverter = (*gemmaModel)(nil)

func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings
Expand All @@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
return kv
}

func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
if strings.HasSuffix(t.Name(), "_norm.weight") {
t.SetRepacker(p.addOne)
}

out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
Expand Down
6 changes: 2 additions & 4 deletions convert/convert_gemma2.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package convert

import (
"github.com/ollama/ollama/llm"
)
import "github.com/ollama/ollama/fs/ggml"

type gemma2Model struct {
gemmaModel
Expand All @@ -11,7 +9,7 @@ type gemma2Model struct {
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}

func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma2"
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
Expand Down
10 changes: 5 additions & 5 deletions convert/convert_gemma2_adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type gemma2Adapter struct {
Expand All @@ -15,14 +15,14 @@ type gemma2Adapter struct {

var _ AdapterConverter = (*gemma2Adapter)(nil)

func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "gemma2"
return kv
}

func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
Expand All @@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
t.SetRepacker(p.repack)
}

out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
Expand Down
12 changes: 6 additions & 6 deletions convert/convert_llama.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type llamaModel struct {
Expand Down Expand Up @@ -46,7 +46,7 @@ type llamaModel struct {

var _ ModelConverter = (*llamaModel)(nil)

func (p *llamaModel) KV(t *Tokenizer) llm.KV {
func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "llama"
kv["llama.vocab_size"] = p.VocabSize
Expand Down Expand Up @@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
return kv
}

func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor

if p.RopeScaling.factors != nil {
out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: "rope_freqs.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
Expand All @@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
t.SetRepacker(p.repack)
}

out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
Expand Down
10 changes: 5 additions & 5 deletions convert/convert_llama_adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"

"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/fs/ggml"
)

type llamaAdapter struct {
Expand All @@ -18,7 +18,7 @@ type llamaAdapter struct {

var _ AdapterConverter = (*llamaAdapter)(nil)

func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "llama"
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
Expand All @@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
return kv
}

func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
var out []ggml.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
Expand All @@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
t.SetRepacker(p.repack)
}

out = append(out, llm.Tensor{
out = append(out, ggml.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: shape,
Expand Down
Loading

0 comments on commit 5824541

Please sign in to comment.