include ./API_KEYS
include benchmark/benchmark.mk
### Pipeline parameters and their default values
engine ?= gpt-4
temperature ?= 0.0# only used for user-facing prompts. All other prompts use temperature 0.0 regardless of this value
pipeline ?= early_combine# one of: generate_and_correct, retrieve_and_generate, generate, retrieve_only, early_combine, atlas
evi_num ?= 2# number of evidence passages used to verify each subclaim
retrieval_num ?= 3# number of passages to fetch in the initial retrieval
reranking_method ?= date# none or date
skip_verification ?= false
fuse_claim_splitting ?= false
do_refine ?= true
refinement_prompt ?= refine_w_feedback.prompt
colbert_endpoint ?= http://127.0.0.1:5000/search
redis_port ?= 5003
debug_mode ?= false
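# Example (illustrative values): any of the parameters above can be overridden on
# the make command line, e.g.
#   make demo engine=gpt-4 pipeline=generate_and_correct do_refine=false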
PIPELINE_FLAGS = --pipeline $(pipeline) \
--engine $(engine) \
--claim_prompt_template_file split_claims.prompt \
--refinement_prompt $(refinement_prompt) \
--colbert_endpoint $(colbert_endpoint) \
--reranking_method $(reranking_method) \
--evi_num $(evi_num) \
--retrieval_num $(retrieval_num) \
--temperature $(temperature)
ifdef generate_engine
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --generate_engine $(generate_engine)
endif
ifdef draft_engine
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --draft_engine $(draft_engine)
endif
ifeq ($(do_refine), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --do_refine
endif
ifeq ($(skip_verification), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --skip_verification
endif
ifeq ($(fuse_claim_splitting), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --fuse_claim_splitting
endif
ifeq ($(debug_mode), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --debug_mode
endif
.PHONY: demo simulate-users start-colbert start-backend start-colbert-gunicorn start-backend-gunicorn download-colbert-index download-and-index-wiki download-latest-wiki extract-wiki split-wiki index-wiki coalesce-index tar-wiki-index db-to-file run-tests
demo:
	python chat_interactive.py \
		$(PIPELINE_FLAGS) \
		--output_file data/demo.txt
start-colbert:
	python colbert_app.py \
		--colbert_index_path workdir/$(language)/wikipedia_$(wiki_date)/$(index_name) \
		--colbert_checkpoint $(colbert_checkpoint) \
		--colbert_collection_path $(collection) \
		--memory_map "$(colbert_memory_map)"
# Using more than one gunicorn worker loads the ColBERT index once per worker, which can run the machine out of RAM
start-colbert-gunicorn:
	gunicorn \
		'colbert_app:gunicorn_app( \
		colbert_index_path="workdir/$(language)/wikipedia_$(wiki_date)/$(index_name)", \
		colbert_checkpoint="$(colbert_checkpoint)", \
		colbert_collection_path="$(collection)", \
		memory_map="$(colbert_memory_map)")' \
		--access-logfile=- \
		--bind 0.0.0.0:5000 \
		--workers 1 \
		--timeout 0
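# Example (illustrative; adjust to the index you actually built or downloaded):
#   make start-colbert-gunicorn language=en wiki_date=11_06_2023 index_name=wikipedia.all.1bits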
start-backend:
	redis-server --port $(redis_port) --daemonize yes
	python server_api.py \
		$(PIPELINE_FLAGS) \
		# --test
		# --no_logging
# Only gunicorn's built-in arguments can be passed to this target; the pipeline arguments use the default values above
ssl_key_file ?= /path/to/privkey.pem
ssl_certificate_file ?= /path/to/fullchain.pem
start-backend-gunicorn:
	redis-server --port $(redis_port) --daemonize yes
	gunicorn \
		'server_api:gunicorn_app( \
		input_flags="$(PIPELINE_FLAGS)" \
		)' \
		--bind 0.0.0.0:5001 \
		--workers 4 \
		--keyfile $(ssl_key_file) \
		--certfile $(ssl_certificate_file) \
		--access-logfile=- \
		--timeout 0
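# Example (key/certificate paths are hypothetical placeholders):
#   make start-backend-gunicorn ssl_key_file=/etc/letsencrypt/live/example.org/privkey.pem \
#        ssl_certificate_file=/etc/letsencrypt/live/example.org/fullchain.pem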
##### ColBERT indexing #####
nbits ?= 1# number of bits to encode each dimension with
max_block_words ?= 100# maximum number of words in each paragraph
doc_maxlen ?= 140# maximum passage length in tokens, enough for the 100-word paragraph plus its title
colbert_checkpoint ?= colbert-ir/colbertv2.0
split ?= all# '1m' passages or 'all'
experiment_name ?= wikipedia_$(split)
index_name ?= wikipedia.$(split).$(nbits)bits
wiki_date ?= 11_06_2023
language ?= en
collection ?= ./workdir/$(language)/wikipedia_$(wiki_date)/collection_$(split).tsv
nranks ?= 8# number of GPUs to use
colbert_memory_map ?= false
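# Typical end-to-end indexing flow, shown for illustration (each target is defined below;
# run them in this order):
#   make download-latest-wiki extract-wiki split-wiki index-wiki coalesce-index tar-wiki-index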
download-colbert-index:
	python -c 'from huggingface_hub import snapshot_download; snapshot_download(repo_id="stanford-oval/wikipedia_colbert_index", repo_type="dataset", local_dir="workdir/", local_dir_use_symlinks=False, allow_patterns="*.tar.*")' ; \
	cat workdir/$(language)/wikipedia_$(wiki_date).tar.* | tar xvf - -C ./workdir/$(language)/ ; \
	rm workdir/$(language)/wikipedia_$(wiki_date).tar.*
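# Example (illustrative; assumes the archive unpacks to
# workdir/$(language)/wikipedia_$(wiki_date)/$(index_name), the layout start-colbert expects):
#   make download-colbert-index && make start-colbert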
# Takes ~70 minutes. Using `axel -o` instead of `wget -O` can speed up the download.
download-latest-wiki: workdir/
	mkdir -p workdir/$(language)/wikipedia_$(wiki_date)/
	wget \
		-O workdir/$(language)/wikipedia_$(wiki_date)/$(language)wiki-latest-pages-articles.xml.bz2 \
		https://dumps.wikimedia.org/$(language)wiki/latest/$(language)wiki-latest-pages-articles.xml.bz2
# Takes ~5 hours. Will output a lot of template warnings, which is fine.
extract-wiki: workdir/$(language)/wikipedia_$(wiki_date)/$(language)wiki-latest-pages-articles.xml.bz2
	python -m wikiextractor.wikiextractor.WikiExtractor workdir/$(language)/wikipedia_$(wiki_date)/$(language)wiki-latest-pages-articles.xml.bz2 \
		--templates workdir/$(language)/wiki_templates.txt \
		--output workdir/$(language)/wikipedia_$(wiki_date)/text/ \
		--links
# Takes ~1 minute for English, but longer for other languages because we also need to find entity translations
split-wiki: workdir/$(language)/wikipedia_$(wiki_date)/text/
	python wikiextractor/split_passages.py \
		--input_path workdir/$(language)/wikipedia_$(wiki_date)/text/ \
		--output_path $(collection) \
		--max_block_words $(max_block_words) \
		--language $(language) \
		--translation_cache workdir/translation_cache.json
# Takes 24 hours on a 40GB A100 GPU
index-wiki: $(collection)
	python index_wiki.py \
		--nbits $(nbits) \
		--doc_maxlen $(doc_maxlen) \
		--checkpoint $(colbert_checkpoint) \
		--split $(split) \
		--experiment_name $(experiment_name) \
		--index_name $(index_name) \
		--collection $(collection) \
		--nranks $(nranks)
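# Example (illustrative): index only the 1M-passage subset on 4 GPUs:
#   make index-wiki split=1m nranks=4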
# Takes ~10 minutes
coalesce-index: experiments/$(experiment_name)/indexes/$(index_name)
	python ColBERT/colbert/utils/coalesce.py \
		--input experiments/$(experiment_name)/indexes/$(index_name) \
		--output workdir/$(language)/wikipedia_$(wiki_date)/$(index_name)
tar-wiki-index:
	tar -cvf colbert_wikipedia_index_$(language)_$(wiki_date).tar workdir/$(language)/wikipedia_$(wiki_date)/