From 5abfd3a50196e1774e1536ff3e763f60e87dd890 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 14 Nov 2022 20:36:51 -0800 Subject: [PATCH 1/4] update CI --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 048da1c9a0..0923407bb7 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -64,7 +64,7 @@ jobs: python -m pip install --upgrade pip python -m pip install setuptools pytest pytest-cov contextvars python -m pip install --upgrade cython - python -m pip install --pre "mxnet>=2.0.0b20210121" -f https://dist.mxnet.io/python + python -m pip install --pre "mxnet>=2.0.0b20220121" -f https://dist.mxnet.io/python python -m pip install -U -e .[extras,dev] - name: Build and Install TVM if: matrix.os == 'ubuntu-latest' From 2ce9709de9749e0ed3f393a8a6cd5f664ffcbc78 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 14 Nov 2022 20:40:41 -0800 Subject: [PATCH 2/4] update CI --- .github/workflows/unittests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 0923407bb7..048da1c9a0 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -64,7 +64,7 @@ jobs: python -m pip install --upgrade pip python -m pip install setuptools pytest pytest-cov contextvars python -m pip install --upgrade cython - python -m pip install --pre "mxnet>=2.0.0b20220121" -f https://dist.mxnet.io/python + python -m pip install --pre "mxnet>=2.0.0b20210121" -f https://dist.mxnet.io/python python -m pip install -U -e .[extras,dev] - name: Build and Install TVM if: matrix.os == 'ubuntu-latest' From 1c105ccbc5beb3b0e1ba997689ce1b93490c3ed6 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 14 Nov 2022 21:09:34 -0800 Subject: [PATCH 3/4] update word_embedding --- .../word_embedding/word_embedding.md | 27 +++++++++---------- setup.py | 2 +- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/docs/tutorials/word_embedding/word_embedding.md b/docs/tutorials/word_embedding/word_embedding.md index 6557630e80..7c80a095b3 100644 --- a/docs/tutorials/word_embedding/word_embedding.md +++ b/docs/tutorials/word_embedding/word_embedding.md @@ -33,11 +33,11 @@ To begin, let's first import a few packages that we'll need for this example: import warnings warnings.filterwarnings('ignore') -from mxnet import gluon, nd +from mxnet import gluon, np import gluonnlp as nlp import re import collections -import numpy as np +import numpy as onp ``` @@ -160,7 +160,7 @@ For example, ```{.python .input} def simple(words): - return np.ones((len(words), 300)) + return onp.ones((len(words), 300)) matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple', unk_method=simple) ``` @@ -217,7 +217,7 @@ input_dim, output_dim = matrix.shape layer = gluon.nn.Embedding(input_dim, output_dim) layer.initialize() layer.weight.set_data(matrix) -layer(nd.array([5, 4]))[:, :5] +layer(np.array([5, 4]))[:, :5] ``` ### Creating Vocabulary from Pre-trained Word Embeddings @@ -257,18 +257,17 @@ To apply word embeddings, we need to define cosine similarity. Cosine similarity determines the similarity between two vectors. ```{.python .input} -import numpy as np def cos_sim(x, y): - return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + return onp.dot(x, y) / (onp.linalg.norm(x) * onp.linalg.norm(y)) ``` The range of cosine similarity between two vectors can be between -1 and 1. The larger the value, the larger the similarity between the two vectors. ```{.python .input} -x = np.array([1, 2]) -y = np.array([10, 20]) -z = np.array([-1, -2]) +x = onp.array([1, 2]) +y = onp.array([10, 20]) +z = onp.array([-1, -2]) print(cos_sim(x, y)) print(cos_sim(x, z)) @@ -287,16 +286,16 @@ We can then find the indices for which the dot product is greatest (`topk`), whi ```{.python .input} def norm_vecs_by_row(x): - return x / np.sqrt(np.sum(x * x, axis=1) + 1E-10).reshape((-1,1)) + return x / onp.sqrt(onp.sum(x * x, axis=1) + 1E-10).reshape((-1,1)) def topk(res, k): - part = np.argpartition(res, -k)[-k:] - return part[np.argsort(res[part])].tolist()[::-1] + part = onp.argpartition(res, -k)[-k:] + return part[onp.argsort(res[part])].tolist()[::-1] def get_knn(vocab, matrix, k, word): word_vec = matrix[vocab[word]].reshape((-1, 1)) vocab_vecs = norm_vecs_by_row(matrix) - dot_prod = np.dot(vocab_vecs, word_vec) + dot_prod = onp.dot(vocab_vecs, word_vec) indices = topk(dot_prod.reshape((len(vocab), )), k=k+1) # Remove unknown and input tokens. return vocab.to_tokens(indices[1:]) @@ -351,7 +350,7 @@ def get_top_k_by_analogy(vocab, matrix, k, word1, word2, word3): word_vecs = [matrix[vocab[word]] for word in [word1, word2, word3]] word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1)) vocab_vecs = norm_vecs_by_row(matrix) - dot_prod = np.dot(vocab_vecs, word_diff) + dot_prod = onp.dot(vocab_vecs, word_diff) indices = topk(dot_prod.reshape((len(vocab), )), k=k) return vocab.to_tokens(indices) ``` diff --git a/setup.py b/setup.py index baf44e6110..0297e4eeed 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ def find_version(*file_paths): 'contextvars', 'pyarrow', 'sentencepiece==0.1.95', - 'protobuf', + 'protobuf<=3.20.1', 'pandas', 'tokenizers==0.9.4', 'dataclasses;python_version<"3.7"', # Dataclass for python <= 3.6 From dcd7330977613b694a1d844ff9553c7749e1955d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 14 Nov 2022 23:20:51 -0800 Subject: [PATCH 4/4] fix unittest --- tests/test_utils_misc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_utils_misc.py b/tests/test_utils_misc.py index de6b3198aa..6515cdf7d5 100644 --- a/tests/test_utils_misc.py +++ b/tests/test_utils_misc.py @@ -52,6 +52,7 @@ def test_download_s3(overwrite): overwrite=overwrite) +@pytest.mark.skip("RuntimeError: Failed downloading url https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2014-41/cc-index.paths.gz") @pytest.mark.remote_required @pytest.mark.parametrize('overwrite', [False, True]) def test_download_https(overwrite):