# Provenance: recovered from a two-patch `git format-patch` series by
# luchun <71970539+zhanghy-sketchzh@users.noreply.github.com>.
#
# [PATCH 1/2] Create multi_turn_process.py
#   commit f8ae36a028305b011239385174d1f180b38a7d83
#   Date: Sat, 27 Jan 2024 19:35:08 +0800
#   new file (mode 100644, index 0000000..9e53247):
#   dbgpt_hub/data_process/multi_turn_process.py -- the module below.
#
# [PATCH 2/2] Update config.py
#   commit 37d249fb5cd408b2bbbc8c011a72e4d8d82b6b4e
#   Date: Sat, 27 Jan 2024 19:35:59 +0800
#   dbgpt_hub/configs/config.py (index 268d6d6..e271195 100644),
#   1 file changed, 4 insertions(+), 1 deletion(-).  Hunk, kept for reference
#   (original leading whitespace was lost when the patch was flattened):
#
#   @@ -86,12 +86,15 @@
#        # }
#        # ,
#        # {
#   +    # {
#        # "data_source": "sparc",
#        # "train_file": ["train.json"],
#   +    # "train_tables_file": "tables.json",
#   +    # "dev_tables_file": "tables.json",
#        # "dev_file": ["dev.json"],
#   -    # "tables_file": "tables.json",
#        # "db_id_name": "database_id",
#        # "is_multiple_turn": True,
#   +    # "output_name": "query",
#        # }
#        ]
#        INSTRUCTION_PROMPT = """\
#
#   NOTE(review): the added "# {" duplicates the opening brace that is already
#   present in the commented-out entry; it looks accidental -- confirm before
#   un-commenting that config block.
"""Flatten text2sql records with a ``history`` field into a multi-turn layout.

Run as a script to rewrite the example train/dev JSON files in place.
"""

import json


def _format_entry(entry):
    """Convert one record into a ``[messages, flags]`` pair.

    Args:
        entry: Mapping with string values under ``"instruction"``,
            ``"input"`` and ``"output"``, and an optional ``"history"``
            sequence of message pairs.

    Returns:
        A two-item list: the flat message list, and a list of alternating
        ``True``/``False`` flags.

    Raises:
        KeyError: If ``"instruction"``, ``"input"`` or ``"output"`` is missing.
    """
    # A missing or null history is treated the same as an empty one.
    history = entry.get("history") or []

    # The current turn always comes first: prompt, then expected output.
    messages = [entry["instruction"] + entry["input"], entry["output"]]

    # NOTE(review): the first history pair is skipped -- presumably it repeats
    # the current turn; confirm against whatever produces these files.
    for pair in history[1:]:
        messages.extend(pair)

    # One (True, False) pair per turn. The current turn always contributes two
    # messages, so emit at least one pair even when the history is empty.
    # NOTE(review): the meaning of the flags is not visible here -- confirm
    # against the consumer of this format.
    flags = [True, False] * max(len(history), 1)
    return [messages, flags]


def process_data(input_file_path):
    """Rewrite the JSON file at ``input_file_path`` in the multi-turn layout.

    The file must contain a JSON list of records (see ``_format_entry``).
    It is overwritten in place with a list of ``[messages, flags]`` pairs,
    so calling this twice on the same file fails: the second run no longer
    finds the original record layout.

    Args:
        input_file_path: Path of the JSON file to read and then overwrite.

    Raises:
        OSError: If the file cannot be read or written.
        KeyError: If a record lacks a required key.
    """
    # Read explicitly as UTF-8 so non-ASCII text does not depend on the
    # platform's default locale encoding.
    with open(input_file_path, "r", encoding="utf-8") as file:
        original_data = json.load(file)

    formatted_data = [_format_entry(entry) for entry in original_data]

    # Write the converted data back over the input file.
    with open(input_file_path, "w", encoding="utf-8") as file:
        json.dump(formatted_data, file, indent=4)

    print(f"数据已成功转换并写入到文件:{input_file_path}")


# Files rewritten in place when this module is run as a script.
train_file_path = "./dbgpt_hub/data/example_text2sql_train.json"
dev_file_path = "./dbgpt_hub/data/example_text2sql_dev.json"

# Guarded so that importing the module does not rewrite data files.
if __name__ == "__main__":
    process_data(train_file_path)
    process_data(dev_file_path)