From 41fe76b1ce1e82fbb1f8c50545a0ff0a39db1df7 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Wed, 13 Mar 2024 21:33:12 -0700 Subject: [PATCH] Add support for GPTBigCodeForCausalLM (#195) But with a JSON architecture definition this time. --- .../gpt2-sequence-classification.json | 6 -- mergekit/_data/architectures/gpt2.json | 6 -- mergekit/_data/architectures/gptbigcode.json | 70 +++++++++++++++++++ 3 files changed, 70 insertions(+), 12 deletions(-) create mode 100644 mergekit/_data/architectures/gptbigcode.json diff --git a/mergekit/_data/architectures/gpt2-sequence-classification.json b/mergekit/_data/architectures/gpt2-sequence-classification.json index 7e89ca65..54cf31f6 100644 --- a/mergekit/_data/architectures/gpt2-sequence-classification.json +++ b/mergekit/_data/architectures/gpt2-sequence-classification.json @@ -60,12 +60,6 @@ }, { "name": "transformer.h.${layer_index}.mlp.c_fc.bias" - }, - { - "name": "transformer.h.${layer_index}.mlp.c_proj.weight" - }, - { - "name": "transformer.h.${layer_index}.mlp.c_proj.bias" } ] } diff --git a/mergekit/_data/architectures/gpt2.json b/mergekit/_data/architectures/gpt2.json index 8d151df3..64a04e9d 100644 --- a/mergekit/_data/architectures/gpt2.json +++ b/mergekit/_data/architectures/gpt2.json @@ -58,12 +58,6 @@ }, { "name": "h.${layer_index}.mlp.c_fc.bias" - }, - { - "name": "h.${layer_index}.mlp.c_proj.weight" - }, - { - "name": "h.${layer_index}.mlp.c_proj.bias" } ] } diff --git a/mergekit/_data/architectures/gptbigcode.json b/mergekit/_data/architectures/gptbigcode.json new file mode 100644 index 00000000..4b086278 --- /dev/null +++ b/mergekit/_data/architectures/gptbigcode.json @@ -0,0 +1,70 @@ +{ + "model_type": "gpt_bigcode", + "architectures": [ + "GPTBigCodeForCausalLM" + ], + "pre_weights": [ + { + "name": "transformer.wte.weight", + "is_embed": true + }, + { + "name": "transformer.wpe.weight" + } + ], + "post_weights": [ + { + "name": "transformer.ln_f.weight" + }, + { + "name": "transformer.ln_f.bias" + }, + { + "name": "lm_head.weight", + "aliases": [ + "transformer.wte.weight" + ] + } + ], + "num_layers_config_key": "n_layer", + "layer_templates": { + "weights": [ + { + "name": "transformer.h.${layer_index}.attn.c_attn.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_attn.bias" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.attn.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_1.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_1.bias" + }, + { + "name": "transformer.h.${layer_index}.ln_2.weight" + }, + { + "name": "transformer.h.${layer_index}.ln_2.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_proj.bias" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.weight" + }, + { + "name": "transformer.h.${layer_index}.mlp.c_fc.bias" + } + ] + } +}