From 73870047b64a36950360c5671b3ee3b707537846 Mon Sep 17 00:00:00 2001
From: Xuanyu Zhou
Date: Sun, 28 Oct 2018 11:29:27 -0500
Subject: [PATCH 1/3] fix install process

---
 README.md        |  5 +++++
 install.sh       | 27 +++++++++++++++++++++++++++
 requirements.txt |  1 -
 server.py        |  6 +++++-
 zoe_utils.py     |  3 +++
 5 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 08d442e..a111478 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ To this end, we are working on an online demo, and we plan to release it before
 * Minimum 16G available disk space and 16G memory. (Lower specs will not work)
 * Python 3.X (Mostly tested on 3.5)
 * A POSIX OS (Windows not tested)
+* Java JDK and Maven
 * `virtualenv` if you are installing with script (check if `virtualenv` command works)
 * `wget` if you are installing with script (Use brew to install it on OSX)
 * `unzip` if you are installing with script
@@ -60,6 +61,10 @@ Currently you can do the following:
 * Run experiment on BBN test set: `python3 main.py bbn`
 * Run experiment on the first 1000 Ontonotes_fine test set instances (due to size issue): `python3 main.py ontonotes`
 
+Additionally, you can run server mode that initializes an online demo with `python3 server.py`.
+However, this requires some additional files that are not provided for download yet.
+Please directly contact the authors.
+
 It's generally an expensive operation to run on new sentences, but you can still do it.
 Please refer to `main.py` to see how you can test on your own data.
 
diff --git a/install.sh b/install.sh
index fc25240..8f8ec06 100644
--- a/install.sh
+++ b/install.sh
@@ -1,5 +1,31 @@
 #!/bin/bash
+
+if ! [ -x "$(command -v java)" ]; then
+  echo 'Error: Java is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v mvn)" ]; then
+  echo 'Error: Maven is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v python3)" ]; then
+  echo 'Error: Python 3.x is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v virtualenv)" ]; then
+  echo 'Error: virtualenv is not installed.'
+  exit 1
+fi
+if ! [ -x "$(command -v wget)" ]; then
+  echo 'Error: wget is not found. Either install it or find a replacement and modify this script.'
+  exit 1
+fi
+if ! [ -x "$(command -v unzip)" ]; then
+  echo 'Error: unzip is not found. Either install it or find a replacement and modify this script.'
+  exit 1
+fi
+echo 'All dependencies satisfied. Moving on...'
 
 virtualenv -p python3 venv
 cd ./bilm-tf
 ../venv/bin/python3 setup.py install
@@ -7,6 +33,7 @@ wget http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/model.zip
 unzip model.zip
 rm model.zip
 cd ../
+venv/bin/pip3 install Cython
 venv/bin/pip3 install -r requirements.txt
 wget http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/data.zip
 unzip -n data.zip
diff --git a/requirements.txt b/requirements.txt
index 4df8f04..184112f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,5 @@ scipy
 regex
 Flask
 flask-cors
-cython
 ccg_nlpy
 gensim
\ No newline at end of file
diff --git a/server.py b/server.py
index bbb9d45..52f0b93 100644
--- a/server.py
+++ b/server.py
@@ -30,7 +30,11 @@ def __init__(self, sql_db_path, surface_cache_path):
         self.pipeline = local_pipeline.LocalPipeline()
         self.pipeline_initialize_helper(['.'])
         self.runner = ZoeRunner(allow_tensorflow=True)
-        self.runner.elmo_processor.load_sqlite_db(sql_db_path, server_mode=True)
+        status = self.runner.elmo_processor.load_sqlite_db(sql_db_path, server_mode=True)
+        if not status:
+            print("ELMo cache file not found. Server mode cannot run without it.")
+            print("Please contact the authors for this cache, or modify this code if you know what you are doing.")
+            exit(1)
         self.runner.elmo_processor.rank_candidates_vec()
         signal.signal(signal.SIGINT, self.grace_end)
 
diff --git a/zoe_utils.py b/zoe_utils.py
index c74efbc..86545b9 100644
--- a/zoe_utils.py
+++ b/zoe_utils.py
@@ -37,10 +37,13 @@ def __init__(self, allow_tensorflow):
         self.word2vec = None
 
     def load_sqlite_db(self, path, server_mode=False):
+        if not os.path.isfile(path):
+            return False
         self.db_conn = sqlite3.connect(path)
         self.db_path = path
         self.server_mode = server_mode
         self.db_loaded = True
+        return True
 
     def query_sqlite_db(self, candidates):
         if not self.db_loaded:

From d9c5654c29dc4bc10aa6f0d387fd1cc00222040e Mon Sep 17 00:00:00 2001
From: Xuanyu Zhou
Date: Sun, 28 Oct 2018 11:54:28 -0500
Subject: [PATCH 2/3] shorten README

---
 README.md | 101 +++++++++++------------------------------------------
 1 file changed, 20 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index a111478..2f8e3c8 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# ZOE (Zero-shot Open Typing)
+# ZOE (Zero-shot Open Entity Typing)
 A state of the art system for zero-shot entity fine typing with minimum supervision
 
 ## Introduction
@@ -7,17 +7,17 @@ This is a demo system for our paper "Zero-Shot Open Entity Typing as Type-Compat
 which at the time of publication represents the state-of-the-art of zero-shot entity typing.
 
 The original experiments that produced all the results in the paper
-are done with a package written in Java. This is a re-written package
-that contains the same core, without experimental code. It's solely for
+were done with a package written in Java. This is a re-written package solely for
 the purpose of demoing the algorithm and validating key results.
 
-The results may slightly differ from published numbers, due to the randomness in Java's
-HashSet and Python set's iteration order. The difference should be within 0.5%.
+The results may differ slightly from the published numbers, due to the randomness in Java's
+HashSet and Python set's iteration order. The difference should be negligible.
 
-A major flaw of this system is the speed of running new sentences, due to ELMo processing.
-We have cached ELMo results for the provided experiments to make running experiments possible.
+This system may take a long time if run on a large number of new sentences, due to ELMo processing.
+We have cached ELMo results for the provided experiments.
 
-To this end, we are working on an online demo, and we plan to release it before EMNLP 2018.
+The package also contains an online demo; please refer to the [Publication Page](http://cogcomp.org/page/publication_view/845)
+for more details.
 
 ## Usage
 
@@ -25,100 +25,39 @@ To this end, we are working on an online demo, and we plan to release it before
 
 #### Prerequisites
 
-* Minimum 16G available disk space and 16G memory. (Lower specs will not work)
+* Minimum 20G available disk space and 16G memory. (strict requirement)
 * Python 3.X (Mostly tested on 3.5)
-* A POSIX OS (Windows not tested)
+* A POSIX OS (Windows not supported)
 * Java JDK and Maven
-* `virtualenv` if you are installing with script (check if `virtualenv` command works)
+* `virtualenv` if you are installing with script
 * `wget` if you are installing with script (Use brew to install it on OSX)
 * `unzip` if you are installing with script
 
-#### Install using a shell script
+#### Install using a one-line command
 
-To make everyone's life easier, we have provided a simple way for install, simply run `sh install.sh`.
+To make life easier, we provide a simple way to install: run `sh install.sh`.
 This script does everything mentioned in the next section, plus creating a virtualenv.
 Use `source venv/bin/activate` to activate.
 
 #### Install manually
 
-Generally it's recommended to create a Python3 virtualenv and work under it.
-
-You need to first install AllenAI's bilm-tf package by running `python3 setup.py install` in ./bilm-tf directory
-
-Then install requirements by `pip3 install -r requirements.txt` in project root
-
-Then you need to download all the data/model files. There are two steps in this:
-* in bilm-tf/, download [model.zip](http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/model.zip), and uncompress
-* project root, download [data.zip](http://cogcomp.org/Data/ccgPapersData/xzhou45/zoe/data.zip), and uncompress
-
-Then check if all files are here by `python3 scripts.py CHECKFILES` or `python3 scripts.py CHECKFILES figer`
-in order to check figer caches etc.
+See the wiki page: [Manual Installation](https://github.com/CogComp/zoe/wiki/Manual-Installation)
 
 ### Run the system
 
-Currently you can do the following:
+Currently you can do the following without changes to the code:
 
 * Run experiment on FIGER test set (randomly sampled as the paper): `python3 main.py figer`
 * Run experiment on BBN test set: `python3 main.py bbn`
 * Run experiment on the first 1000 Ontonotes_fine test set instances (due to size issue): `python3 main.py ontonotes`
 
-Additionally, you can run server mode that initializes an online demo with `python3 server.py`.
+Additionally, you can run server mode that initializes the online demo with `python3 server.py`.
 However, this requires some additional files that are not provided for download yet.
 Please directly contact the authors.
 
-It's generally an expensive operation to run on new sentences, but you can still do it.
-Please refer to `main.py` to see how you can test on your own data.
-
-## Engineering details
-
-### Structure
-
-The package is composed with
-
-* A slightly modified ELMo source code, see [bilm-tf](https://github.com/allenai/bilm-tf)
-* A main library `zoe_utils.py`
-* A executor `main.py`
-* A script helper `script.py`
-
-### zoe_utils.py
-
-This is the main library file which contains the core logic.
-
-It has 4 main component Classes:
-
-#### `EsaProcessor`
-
-Supports all operations related to ESA and its data files.
-
-A main entrance is `EsaProcessor.get_candidates` which given a sentence, returns
-the top `EsaProcessor.RETURN_NUM` candidate Wikipedia concepts
-
-#### `ElmoProcessor`
-
-Supports all operations related to ElMo and its data files.
-
-A main entrance is `ElmoProcessor.rank_candidates`, which given a sentence and a list
-of candidates (generated from ESA), rank them by ELMo representation cosine similarities. (see paper)
-
-It will return the top `ElmoProcessor.RANKED_RETURN_NUM` candidates.
-
-#### `InferenceProcessor`
-
-This is the core engine that does inference given outputs from the previous processors.
-
-The logic behind it is as described in the paper and is rather complicated.
-
-One main entrance is `InferenceProcessor.inference` which receives a sentence, outputs from
-previously mentioned processors, and set inference results.
-
-#### `Evaluator`
-
-This evaluates performances and print them, after given a list of sentences processed by
-`InferenceProcessor`
-
-#### `DataReader`
-
-Initialize this with a data file path. It reads standard json formats (see examples)
-and transform the data into a list of `Sentence`
+It's generally an expensive operation to run on a large number of new sentences, but you are welcome to do it.
+Please refer to `main.py` and [Engineering Details](https://github.com/CogComp/zoe/wiki/Engineering-Details)
+to see how you can test on your own data.
 
 ## Citation
 See the following paper:

From a9f4a027806b981976cd12e8368d52ea8fd40295 Mon Sep 17 00:00:00 2001
From: Xuanyu Zhou
Date: Sun, 28 Oct 2018 16:11:43 -0500
Subject: [PATCH 3/3] tokenizer + several bug fixes

---
 frontend/index.html | 38 +++++++++++++++---
 server.py           | 93 +++++++++++++++++++++------------------------
 zoe_utils.py        |  2 +-
 3 files changed, 76 insertions(+), 57 deletions(-)

diff --git a/frontend/index.html b/frontend/index.html
index 594acb2..96536bd 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -218,7 +218,23 @@
             alert("You must enter a sentence to proceed.");
             return;
         }
-        var tokens = sentence.trim().split(" ");
+        let xhr = new XMLHttpRequest();
+        xhr.open("POST", SERVER_API + "annotate_token", true);
+        xhr.setRequestHeader("Content-Type", "application/json");
+        xhr.onreadystatechange = function () {
+            if (xhr.readyState === XMLHttpRequest.DONE && xhr.status === 200) {
+                var json = JSON.parse(xhr.responseText);
+                continueGenerateTokens(json);
+            }
+        };
+        var data = JSON.stringify({
+            sentence: sentence,
+        });
+        xhr.send(data);
+    }
+
+    function continueGenerateTokens(result) {
+        var tokens = result["tokens"];
         document.getElementById("total-token-num").innerText = String(tokens.length);
         for (var i = 0; i < tokens.length; i++) {
             var curToken = tokens[i];
@@ -262,6 +278,16 @@
         document.getElementById("using-preset-example").innerText = String(-1);
     }
 
+    function getTokens() {
+        var parent_div = document.getElementById("token-display");
+        var i;
+        var tokens = [];
+        for (i = 0; i < parent_div.children.length; i++) {
+            tokens.push(parent_div.children[i].innerHTML);
+        }
+        return tokens;
+    }
+
     function generatePresetMentions() {
         var sentence = document.getElementById("sentence-input").value;
         var xhr = new XMLHttpRequest();
@@ -274,7 +300,7 @@
             }
         };
         var data = JSON.stringify({
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
         });
         xhr.send(data);
     }
@@ -504,7 +530,7 @@
         };
         var data_vec = JSON.stringify({
             index: i,
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
             mention_starts: [mention_starts[i]],
             mention_ends: [mention_ends[i]],
         });
@@ -521,7 +547,7 @@
         };
         var data_simple = JSON.stringify({
             index: i,
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
             mention_starts: [mention_starts[i]],
             mention_ends: [mention_ends[i]],
         });
@@ -538,7 +564,7 @@
         };
         var data = JSON.stringify({
             index: i,
-            tokens: sentence.trim().split(" "),
+            tokens: getTokens(),
             mention_starts: [mention_starts[i]],
             mention_ends: [mention_ends[i]],
             mode: getInferenceMode(),
@@ -634,7 +660,7 @@
 
     function getExampleSentenceMention(id) {
         if (id == 1) {
-            return [[0, 2], [10, 12], [15, 17]];
+            return [[0, 2], [11, 13], [16, 18]];
         }
         if (id == 2) {
             return [[0, 1], [5, 7], [9, 11], [20, 21]];
diff --git a/server.py b/server.py
index 52f0b93..695d761 100644
--- a/server.py
+++ b/server.py
@@ -112,53 +112,31 @@ def handle_input(self):
         if mode != "figer":
             if mode != "custom":
                 selected_inference_processor = InferenceProcessor(mode, resource_loader=self.runner.inference_processor)
-                for sentence in sentences:
-                    sentence.set_signature(selected_inference_processor.signature())
-                    cached = self.mem_cache.query_cache(sentence)
-                    if cached is not None:
-                        sentence = cached
-                    else:
-                        self.runner.process_sentence(sentence, selected_inference_processor)
-                        self.mem_cache.insert_cache(sentence)
-                        self.surface_cache.insert_cache(sentence)
-                    predicted_types.append(list(sentence.predicted_types))
-                    predicted_candidates.append(sentence.elmo_candidate_titles)
-                    mentions.append(sentence.get_mention_surface_raw())
-                    selected_candidates.append(sentence.selected_title)
-                    other_possible_types.append(sentence.could_also_be_types)
             else:
                 rules = r["taxonomy"]
                 mappings = self.parse_custom_rules(rules)
-                custom_inference_processor = InferenceProcessor(mode, custom_mapping=mappings)
-                for sentence in sentences:
-                    sentence.set_signature(custom_inference_processor.signature())
-                    cached = self.mem_cache.query_cache(sentence)
-                    if cached is not None:
-                        sentence = cached
-                    else:
-                        self.runner.process_sentence(sentence, custom_inference_processor)
-                        self.mem_cache.insert_cache(sentence)
-                        self.surface_cache.insert_cache(sentence)
-                    predicted_types.append(list(sentence.predicted_types))
-                    predicted_candidates.append(sentence.elmo_candidate_titles)
-                    mentions.append(sentence.get_mention_surface_raw())
-                    selected_candidates.append(sentence.selected_title)
-                    other_possible_types.append(sentence.could_also_be_types)
+                selected_inference_processor = InferenceProcessor(mode, custom_mapping=mappings)
         else:
-            for sentence in sentences:
-                sentence.set_signature(self.runner.inference_processor.signature())
-                cached = self.mem_cache.query_cache(sentence)
-                if cached is not None:
-                    sentence = cached
-                else:
-                    self.runner.process_sentence(sentence)
+            selected_inference_processor = self.runner.inference_processor
+
+        for sentence in sentences:
+            sentence.set_signature(selected_inference_processor.signature())
+            cached = self.mem_cache.query_cache(sentence)
+            if cached is not None:
+                sentence = cached
+            else:
+                self.runner.process_sentence(sentence, selected_inference_processor)
+                try:
                     self.mem_cache.insert_cache(sentence)
                     self.surface_cache.insert_cache(sentence)
-                predicted_types.append(list(sentence.predicted_types))
-                predicted_candidates.append(sentence.elmo_candidate_titles)
-                mentions.append(sentence.get_mention_surface_raw())
-                selected_candidates.append(sentence.selected_title)
-                other_possible_types.append(sentence.could_also_be_types)
+                except Exception:
+                    print("Cache insertion exception. Ignored.")
+            predicted_types.append(list(sentence.predicted_types))
+            predicted_candidates.append(sentence.elmo_candidate_titles)
+            mentions.append(sentence.get_mention_surface_raw())
+            selected_candidates.append(sentence.selected_title)
+            other_possible_types.append(sentence.could_also_be_types)
+
         elapsed_time = time.time() - start_time
         print("Processed mention " + str([x.get_mention_surface() for x in sentences]) + " in mode " + mode + ". TIME: " + str(elapsed_time) + " seconds.")
         ret["type"] = predicted_types
@@ -176,6 +154,17 @@ def pipeline_initialize_helper(self, tokens):
         doc.get_ner_ontonotes
         doc.get_view("MENTION")
 
+    def handle_tokenizer_input(self):
+        r = request.get_json()
+        ret = {"tokens": []}
+        if "sentence" not in r:
+            return json.dumps(ret)
+        doc = self.pipeline.doc(r["sentence"])
+        token_view = doc.get_tokens
+        for cons in token_view:
+            ret["tokens"].append(str(cons))
+        return json.dumps(ret)
+
     """
     Handles requests for mention filling
     """
@@ -209,12 +198,12 @@ def handle_mention_input(self):
         for cons in additions_view:
             add_to_list = True
             if additions_view.view_name != "MENTION":
-                start = cons['start']
-                end = cons['end']
+                start = int(cons['start'])
+                end = int(cons['end'])
             else:
-                start = cons['properties']['EntityHeadStartSpan']
-                end = cons['properties']['EntityHeadEndSpan']
-            for i in range(start - 1, end + 1):
+                start = int(cons['properties']['EntityHeadStartSpan'])
+                end = int(cons['properties']['EntityHeadEndSpan'])
+            for i in range(max(start - 1, 0), min(len(tokens), end + 1)):
                 if i in ret_set:
                     add_to_list = False
                     break
@@ -246,10 +235,13 @@ def handle_simple_input(self):
         for sentence in sentences:
             surface = sentence.get_mention_surface()
             cached_types = self.surface_cache.query_cache(surface)
-            distinct = set()
-            for t in cached_types:
-                distinct.add("/" + t.split("/")[1])
-            types.append(list(distinct))
+            if cached_types is not None:
+                distinct = set()
+                for t in cached_types:
+                    distinct.add("/" + t.split("/")[1])
+                types.append(list(distinct))
+            else:
+                types.append([])
         ret["type"] = types
         ret["index"] = r["index"]
         return json.dumps(ret)
@@ -294,6 +286,7 @@ def start(self, localhost=False, port=80):
         self.app.add_url_rule("/", "", self.handle_redirection)
         self.app.add_url_rule("/<path:path>", "<path:path>", self.handle_root)
         self.app.add_url_rule("/annotate", "annotate", self.handle_input, methods=['POST'])
+        self.app.add_url_rule("/annotate_token", "annotate_token", self.handle_tokenizer_input, methods=['POST'])
         self.app.add_url_rule("/annotate_mention", "annotate_mention", self.handle_mention_input, methods=['POST'])
         self.app.add_url_rule("/annotate_cache", "annotate_cache", self.handle_simple_input, methods=['POST'])
         self.app.add_url_rule("/annotate_vec", "annotate_vec", self.handle_word2vec_input, methods=['POST'])
diff --git a/zoe_utils.py b/zoe_utils.py
index 86545b9..0afa607 100644
--- a/zoe_utils.py
+++ b/zoe_utils.py
@@ -277,7 +277,7 @@ def rank_candidates_vec(self, sentence=None, candidates=None):
         target_vec = self.word2vec_helper(sentence.get_mention_surface())
         if target_vec is None:
             print(sentence.get_mention_surface() + " not found in word2vec")
-            return candidates
+            return [(x, 0.0) for x in candidates]
         assert(len(target_vec) == 300)
         results = {}
         for candidate in candidates: