From 73870047b64a36950360c5671b3ee3b707537846 Mon Sep 17 00:00:00 2001
From: Xuanyu Zhou
Date: Sun, 28 Oct 2018 11:29:27 -0500
Subject: [PATCH 1/3] fix install process

---
 README.md        |  5 +++++
 install.sh       | 27 +++++++++++++++++++++++++++
 requirements.txt |  1 -
 server.py        |  6 +++++-
 zoe_utils.py     |  3 +++
 5 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 08d442e..a111478 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ To this end, we are working on an online demo, and we plan to release it before
 * Minimum 16G available disk space and 16G memory. (Lower specs will not work)
 * Python 3.X (Mostly tested on 3.5)
 * A POSIX OS (Windows not tested)
+* Java JDK and Maven
 * `virtualenv` if you are installing with script (check if `virtualenv` command works)
 * `wget` if you are installing with script (Use brew to install it on OSX)
 * `unzip` if you are installing with script
@@ -60,6 +61,10 @@ Currently you can do the following:
 * Run experiment on BBN test set: `python3 main.py bbn`
 * Run experiment on the first 1000 Ontonotes_fine test set instances (due to size issue): `python3 main.py ontonotes`
 
+Additionally, you can run server mode that initializes an online demo with `python3 server.py`.
+However, this requires some additional files that are not provided for download yet.
+Please directly contact the authors.
+
 It's generally an expensive operation to run on new sentences, but you can still do it.
 Please refer to `main.py` to see how you can test on your own data.
 
diff --git a/install.sh b/install.sh
index fc25240..8f8ec06 100644
--- a/install.sh
+++ b/install.sh
@@ -1,5 +1,31 @@
 #!/bin/bash
+
+if ! [ -x "$(command -v java)" ]; then
+  echo 'Error: Java is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v mvn)" ]; then
+  echo 'Error: Maven is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v python3)" ]; then
+  echo 'Error: Python 3.x is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v virtualenv)" ]; then
+  echo 'Error: virtualenv is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v wget)" ]; then
+  echo 'Error: wget is not found. Either install it or find a replacement and modify this script.'
+  exit 1
+fi
+if ! [ -x "$(command -v unzip)" ]; then
+  echo 'Error: unzip is not found. Either install it or find a replacement and modify this script.'
+  exit 1
+fi
+echo 'All dependencies satisfied. Moving on...'
 
 virtualenv -p python3 venv
 cd ./bilm-tf
 ../venv/bin/python3 setup.py install
@@ -7,6 +33,7 @@ wget http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/model.zip
 unzip model.zip
 rm model.zip
 cd ../
+venv/bin/pip3 install Cython
 venv/bin/pip3 install -r requirements.txt
 wget http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/data.zip
 unzip -n data.zip
diff --git a/requirements.txt b/requirements.txt
index 4df8f04..184112f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,5 @@ scipy
 regex
 Flask
 flask-cors
-cython
 ccg_nlpy
 gensim
\ No newline at end of file
diff --git a/server.py b/server.py
index bbb9d45..52f0b93 100644
--- a/server.py
+++ b/server.py
@@ -30,7 +30,11 @@ def __init__(self, sql_db_path, surface_cache_path):
         self.pipeline = local_pipeline.LocalPipeline()
         self.pipeline_initialize_helper(['.'])
         self.runner = ZoeRunner(allow_tensorflow=True)
-        self.runner.elmo_processor.load_sqlite_db(sql_db_path, server_mode=True)
+        status = self.runner.elmo_processor.load_sqlite_db(sql_db_path, server_mode=True)
+        if not status:
+            print("ELMo cache file not found. Server mode cannot run without it.")
+            print("Please contact the authors for this cache, or modify this code if you know what you are doing.")
+            exit(1)
         self.runner.elmo_processor.rank_candidates_vec()
         signal.signal(signal.SIGINT, self.grace_end)
 
diff --git a/zoe_utils.py b/zoe_utils.py
index c74efbc..86545b9 100644
--- a/zoe_utils.py
+++ b/zoe_utils.py
@@ -37,10 +37,13 @@ def __init__(self, allow_tensorflow):
         self.word2vec = None
 
     def load_sqlite_db(self, path, server_mode=False):
+        if not os.path.isfile(path):
+            return False
         self.db_conn = sqlite3.connect(path)
         self.db_path = path
         self.server_mode = server_mode
         self.db_loaded = True
+        return True
 
     def query_sqlite_db(self, candidates):
         if not self.db_loaded:

From d9c5654c29dc4bc10aa6f0d387fd1cc00222040e Mon Sep 17 00:00:00 2001
From: Xuanyu Zhou
Date: Sun, 28 Oct 2018 11:54:28 -0500
Subject: [PATCH 2/3] shorten README

---
 README.md | 101 +++++++++++------------------------------------------
 1 file changed, 20 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index a111478..2f8e3c8 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# ZOE (Zero-shot Open Typing)
+# ZOE (Zero-shot Open Entity Typing)
 A state of the art system for zero-shot entity fine typing with minimum supervision
 
 ## Introduction
@@ -7,17 +7,17 @@ This is a demo system for our paper "Zero-Shot Open Entity Typing as Type-Compat
 which at the time of publication represents the state-of-the-art of zero-shot entity typing.
 
 The original experiments that produced all the results in the paper
-are done with a package written in Java. This is a re-written package
-that contains the same core, without experimental code. It's solely for
+were done with a package written in Java. This is a re-written package solely for
 the purpose of demoing the algorithm and validating key results.
 
-The results may slightly differ from published numbers, due to the randomness in Java's
-HashSet and Python set's iteration order. The difference should be within 0.5%.
+The results may differ slightly from the published numbers, due to the randomness in Java's
+HashSet and Python set's iteration order. The difference should be negligible.
 
-A major flaw of this system is the speed of running new sentences, due to ELMo processing.
-We have cached ELMo results for the provided experiments to make running experiments possible.
+This system may take a long time if run on a large number of new sentences, due to ELMo processing.
+We have cached ELMo results for the provided experiments.
 
-To this end, we are working on an online demo, and we plan to release it before EMNLP 2018.
+The package also contains an online demo; please refer to the [Publication Page](http://cogcomp.org/page/publication_view/845)
+for more details.
 
 ## Usage
 
@@ -25,100 +25,39 @@ To this end, we are working on an online demo, and we plan to release it before
 
 #### Prerequisites
 
-* Minimum 16G available disk space and 16G memory. (Lower specs will not work)
+* Minimum 20G available disk space and 16G memory. (strict requirement)
 * Python 3.X (Mostly tested on 3.5)
-* A POSIX OS (Windows not tested)
+* A POSIX OS (Windows not supported)
 * Java JDK and Maven
-* `virtualenv` if you are installing with script (check if `virtualenv` command works)
+* `virtualenv` if you are installing with script
 * `wget` if you are installing with script (Use brew to install it on OSX)
 * `unzip` if you are installing with script
 
-#### Install using a shell script
+#### Install using a one-line command
 
-To make everyone's life easier, we have provided a simple way for install, simply run `sh install.sh`.
+To make life easier, we provide a simple way to install: run `sh install.sh`.
 This script does everything mentioned in the next section, plus creating a virtualenv.
 Use `source venv/bin/activate` to activate.
 
 #### Install manually
 
-Generally it's recommended to create a Python3 virtualenv and work under it.
-
-You need to first install AllenAI's bilm-tf package by running `python3 setup.py install` in ./bilm-tf directory
-
-Then install requirements by `pip3 install -r requirements.txt` in project root
-
-Then you need to download all the data/model files. There are two steps in this:
-* in bilm-tf/, download [model.zip](http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/model.zip), and uncompress
-* project root, download [data.zip](http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/data.zip), and uncompress
-
-Then check if all files are here by `python3 scripts.py CHECKFILES` or `python3 scripts.py CHECKFILES figer`
-in order to check figer caches etc.
+See the wiki page: [Manual Installation](https://github.com/CogComp/zoe/wiki/Manual-Installation)
 
 ### Run the system
 
-Currently you can do the following:
+Currently you can do the following without changes to the code:
 
 * Run experiment on FIGER test set (randomly sampled as the paper): `python3 main.py figer`
 * Run experiment on BBN test set: `python3 main.py bbn`
 * Run experiment on the first 1000 Ontonotes_fine test set instances (due to size issue): `python3 main.py ontonotes`
 
-Additionally, you can run server mode that initializes an online demo with `python3 server.py`.
+Additionally, you can run server mode that initializes the online demo with `python3 server.py`.
 However, this requires some additional files that are not provided for download yet.
 Please directly contact the authors.
 
-It's generally an expensive operation to run on new sentences, but you can still do it.
-Please refer to `main.py` to see how you can test on your own data.
-
-## Engineering details
-
-### Structure
-
-The package is composed with
-
-* A slightly modified ELMo source code, see [bilm-tf](https://github.com/allenai/bilm-tf)
-* A main library `zoe_utils.py`
-* A executor `main.py`
-* A script helper `script.py`
-
-### zoe_utils.py
-
-This is the main library file which contains the core logic.
-
-It has 4 main component Classes:
-
-#### `EsaProcessor`
-
-Supports all operations related to ESA and its data files.
-
-A main entrance is `EsaProcessor.get_candidates` which given a sentence, returns
-the top `EsaProcessor.RETURN_NUM` candidate Wikipedia concepts
-
-#### `ElmoProcessor`
-
-Supports all operations related to ElMo and its data files.
-
-A main entrance is `ElmoProcessor.rank_candidates`, which given a sentence and a list
-of candidates (generated from ESA), rank them by ELMo representation cosine similarities. (see paper)
-
-It will return the top `ElmoProcessor.RANKED_RETURN_NUM` candidates.
-
-#### `InferenceProcessor`
-
-This is the core engine that does inference given outputs from the previous processors.
-
-The logic behind it is as described in the paper and is rather complicated.
-
-One main entrance is `InferenceProcessor.inference` which receives a sentence, outputs from
-previously mentioned processors, and set inference results.
-
-#### `Evaluator`
-
-This evaluates performances and print them, after given a list of sentences processed by
-`InferenceProcessor`
-
-#### `DataReader`
-
-Initialize this with a data file path. It reads standard json formats (see examples)
-and transform the data into a list of `Sentence`
+It's generally an expensive operation to run on a large number of new sentences, but you are welcome to do it.
+Please refer to `main.py` and [Engineering Details](https://github.com/CogComp/zoe/wiki/Engineering-Details)
+to see how you can test on your own data.
 
 ## Citation
 See the following paper:

From a9f4a027806b981976cd12e8368d52ea8fd40295 Mon Sep 17 00:00:00 2001
From: Xuanyu Zhou
Date: Sun, 28 Oct 2018 16:11:43 -0500
Subject: [PATCH 3/3] tokenizer + several bug fixes

---
 frontend/index.html | 38 +++++++++++++++---
 server.py           | 93 +++++++++++++++++++++------------------------
 zoe_utils.py        |  2 +-
 3 files changed, 76 insertions(+), 57 deletions(-)

diff --git a/frontend/index.html b/frontend/index.html
index 594acb2..96536bd 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -218,7 +218,23 @@
             alert("You must enter a sentence to proceed.");
             return;
         }
-        var tokens = sentence.trim().split(" ");
+        let xhr = new XMLHttpRequest();
+        xhr.open("POST", SERVER_API + "annotate_token", true);
+        xhr.setRequestHeader("Content-Type", "application/json");
+        xhr.onreadystatechange = function () {
+            if (xhr.readyState === XMLHttpRequest.DONE && xhr.status === 200) {
+                var json = JSON.parse(xhr.responseText);
+                continueGenerateTokens(json);
+            }
+        };
+        var data = JSON.stringify({
+            sentence: sentence,
+        });
+        xhr.send(data);
+    }
+
+    function continueGenerateTokens(result) {
+        var tokens = result["tokens"];
         document.getElementById("total-token-num").innerText = String(tokens.length);
         for (var i = 0; i < tokens.length; i++) {
             var curToken = tokens[i];
@@ -262,6 +278,16 @@
         document.getElementById("using-preset-example").innerText = String(-1);
     }
 
+    function getTokens() {
+        var parent_div = document.getElementById("token-display");
+        var i;
+        var tokens = [];
+        for (i = 0; i < parent_div.children.length; i++) {
+            tokens.push(parent_div.children[i].innerHTML);
+        }
+        return tokens;
+    }
+
     function generatePresetMentions() {
         var sentence = document.getElementById("sentence-input").value;
         var xhr = new XMLHttpRequest();
@@ -274,7 +300,7 @@
             }
         };
         var data = JSON.stringify({
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
         });
         xhr.send(data);
     }
@@ -504,7 +530,7 @@
         };
         var data_vec = JSON.stringify({
             index: i,
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
             mention_starts: [mention_starts[i]],
             mention_ends: [mention_ends[i]],
         });
@@ -521,7 +547,7 @@
         };
         var data_simple = JSON.stringify({
             index: i,
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
             mention_starts: [mention_starts[i]],
             mention_ends: [mention_ends[i]],
         });
@@ -538,7 +564,7 @@
         };
         var data = JSON.stringify({
             index: i,
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
             mention_starts: [mention_starts[i]],
             mention_ends: [mention_ends[i]],
             mode: getInferenceMode(),
@@ -634,7 +660,7 @@
 
     function getExampleSentenceMention(id) {
         if (id == 1) {
-            return [[0, 2], [10, 12], [15, 17]];
+            return [[0, 2], [11, 13], [16, 18]];
         }
         if (id == 2) {
             return [[0, 1], [5, 7], [9, 11], [20, 21]];
diff --git a/server.py b/server.py
index 52f0b93..695d761 100644
--- a/server.py
+++ b/server.py
@@ -112,53 +112,31 @@ def handle_input(self):
         if mode != "figer":
             if mode != "custom":
                 selected_inference_processor = InferenceProcessor(mode, resource_loader=self.runner.inference_processor)
-                for sentence in sentences:
-                    sentence.set_signature(selected_inference_processor.signature())
-                    cached = self.mem_cache.query_cache(sentence)
-                    if cached is not None:
-                        sentence = cached
-                    else:
-                        self.runner.process_sentence(sentence, selected_inference_processor)
-                        self.mem_cache.insert_cache(sentence)
-                        self.surface_cache.insert_cache(sentence)
-                    predicted_types.append(list(sentence.predicted_types))
-                    predicted_candidates.append(sentence.elmo_candidate_titles)
-                    mentions.append(sentence.get_mention_surface_raw())
-                    selected_candidates.append(sentence.selected_title)
-                    other_possible_types.append(sentence.could_also_be_types)
             else:
                 rules = r["taxonomy"]
                 mappings = self.parse_custom_rules(rules)
-                custom_inference_processor = InferenceProcessor(mode, custom_mapping=mappings)
-                for sentence in sentences:
-                    sentence.set_signature(custom_inference_processor.signature())
-                    cached = self.mem_cache.query_cache(sentence)
-                    if cached is not None:
-                        sentence = cached
-                    else:
-                        self.runner.process_sentence(sentence, custom_inference_processor)
-                        self.mem_cache.insert_cache(sentence)
-                        self.surface_cache.insert_cache(sentence)
-                    predicted_types.append(list(sentence.predicted_types))
-                    predicted_candidates.append(sentence.elmo_candidate_titles)
-                    mentions.append(sentence.get_mention_surface_raw())
-                    selected_candidates.append(sentence.selected_title)
-                    other_possible_types.append(sentence.could_also_be_types)
+                selected_inference_processor = InferenceProcessor(mode, custom_mapping=mappings)
         else:
-            for sentence in sentences:
-                sentence.set_signature(self.runner.inference_processor.signature())
-                cached = self.mem_cache.query_cache(sentence)
-                if cached is not None:
-                    sentence = cached
-                else:
-                    self.runner.process_sentence(sentence)
+            selected_inference_processor = self.runner.inference_processor
+
+        for sentence in sentences:
+            sentence.set_signature(selected_inference_processor.signature())
+            cached = self.mem_cache.query_cache(sentence)
+            if cached is not None:
+                sentence = cached
+            else:
+                self.runner.process_sentence(sentence, selected_inference_processor)
+                try:
                     self.mem_cache.insert_cache(sentence)
                     self.surface_cache.insert_cache(sentence)
-                predicted_types.append(list(sentence.predicted_types))
-                predicted_candidates.append(sentence.elmo_candidate_titles)
-                mentions.append(sentence.get_mention_surface_raw())
-                selected_candidates.append(sentence.selected_title)
-                other_possible_types.append(sentence.could_also_be_types)
+                except Exception:
+                    print("Cache insertion exception. Ignored.")
+            predicted_types.append(list(sentence.predicted_types))
+            predicted_candidates.append(sentence.elmo_candidate_titles)
+            mentions.append(sentence.get_mention_surface_raw())
+            selected_candidates.append(sentence.selected_title)
+            other_possible_types.append(sentence.could_also_be_types)
+
         elapsed_time = time.time() - start_time
         print("Processed mention " + str([x.get_mention_surface() for x in sentences]) + " in mode " + mode + ". TIME: " + str(elapsed_time) + " seconds.")
         ret["type"] = predicted_types
@@ -176,6 +154,17 @@ def pipeline_initialize_helper(self, tokens):
         doc.get_ner_ontonotes
         doc.get_view("MENTION")
 
+    def handle_tokenizer_input(self):
+        r = request.get_json()
+        ret = {"tokens": []}
+        if "sentence" not in r:
+            return json.dumps(ret)
+        doc = self.pipeline.doc(r["sentence"])
+        token_view = doc.get_tokens
+        for cons in token_view:
+            ret["tokens"].append(str(cons))
+        return json.dumps(ret)
+
     """
     Handles requests for mention filling
     """
@@ -209,12 +198,12 @@ def handle_mention_input(self):
         for cons in additions_view:
             add_to_list = True
             if additions_view.view_name != "MENTION":
-                start = cons['start']
-                end = cons['end']
+                start = int(cons['start'])
+                end = int(cons['end'])
             else:
-                start = cons['properties']['EntityHeadStartSpan']
-                end = cons['properties']['EntityHeadEndSpan']
-            for i in range(start - 1, end + 1):
+                start = int(cons['properties']['EntityHeadStartSpan'])
+                end = int(cons['properties']['EntityHeadEndSpan'])
+            for i in range(max(start - 1, 0), min(len(tokens), end + 1)):
                 if i in ret_set:
                     add_to_list = False
                     break
@@ -246,10 +235,13 @@ def handle_simple_input(self):
         for sentence in sentences:
             surface = sentence.get_mention_surface()
             cached_types = self.surface_cache.query_cache(surface)
-            distinct = set()
-            for t in cached_types:
-                distinct.add("/" + t.split("/")[1])
-            types.append(list(distinct))
+            if cached_types is not None:
+                distinct = set()
+                for t in cached_types:
+                    distinct.add("/" + t.split("/")[1])
+                types.append(list(distinct))
+            else:
+                types.append([])
         ret["type"] = types
         ret["index"] = r["index"]
         return json.dumps(ret)
@@ -294,6 +286,7 @@ def start(self, localhost=False, port=80):
         self.app.add_url_rule("/", "", self.handle_redirection)
         self.app.add_url_rule("/<path:path>", "<path:path>", self.handle_root)
         self.app.add_url_rule("/annotate", "annotate", self.handle_input, methods=['POST'])
+        self.app.add_url_rule("/annotate_token", "annotate_token", self.handle_tokenizer_input, methods=['POST'])
         self.app.add_url_rule("/annotate_mention", "annotate_mention", self.handle_mention_input, methods=['POST'])
         self.app.add_url_rule("/annotate_cache", "annotate_cache", self.handle_simple_input, methods=['POST'])
         self.app.add_url_rule("/annotate_vec", "annotate_vec", self.handle_word2vec_input, methods=['POST'])
diff --git a/zoe_utils.py b/zoe_utils.py
index 86545b9..0afa607 100644
--- a/zoe_utils.py
+++ b/zoe_utils.py
@@ -277,7 +277,7 @@ def rank_candidates_vec(self, sentence=None, candidates=None):
         target_vec = self.word2vec_helper(sentence.get_mention_surface())
         if target_vec is None:
             print(sentence.get_mention_surface() + " not found in word2vec")
-            return candidates
+            return [(x, 0.0) for x in candidates]
         assert(len(target_vec) == 300)
         results = {}
         for candidate in candidates: