diff --git a/neuralcoref/cli/include/msvc9/stdint.h b/include/msvc9/stdint.h
similarity index 100%
rename from neuralcoref/cli/include/msvc9/stdint.h
rename to include/msvc9/stdint.h
diff --git a/include/murmurhash/MurmurHash2.h b/include/murmurhash/MurmurHash2.h
new file mode 100644
index 0000000..6d7ccf4
--- /dev/null
+++ b/include/murmurhash/MurmurHash2.h
@@ -0,0 +1,22 @@
+//-----------------------------------------------------------------------------
+// MurmurHash2 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH2_H_
+#define _MURMURHASH2_H_
+
+#include <stdint.h>
+
+//-----------------------------------------------------------------------------
+
+uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed );
+uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed );
+uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed );
+uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed );
+uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed );
+uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed );
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH2_H_
+
diff --git a/include/murmurhash/MurmurHash3.h b/include/murmurhash/MurmurHash3.h
new file mode 100644
index 0000000..9b4c3c9
--- /dev/null
+++ b/include/murmurhash/MurmurHash3.h
@@ -0,0 +1,28 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+#include <stdint.h>
+
+//-----------------------------------------------------------------------------
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
+
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+#ifdef __cplusplus
+}
+#endif
+
+//-----------------------------------------------------------------------------
+
+#endif // _MURMURHASH3_H_
diff --git a/neuralcoref/cli/include/numpy/__multiarray_api.h b/include/numpy/__multiarray_api.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/__multiarray_api.h
rename to include/numpy/__multiarray_api.h
diff --git a/neuralcoref/cli/include/numpy/__ufunc_api.h b/include/numpy/__ufunc_api.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/__ufunc_api.h
rename to include/numpy/__ufunc_api.h
diff --git a/neuralcoref/cli/include/numpy/_neighborhood_iterator_imp.h b/include/numpy/_neighborhood_iterator_imp.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/_neighborhood_iterator_imp.h
rename to include/numpy/_neighborhood_iterator_imp.h
diff --git a/neuralcoref/cli/include/numpy/_numpyconfig.h b/include/numpy/_numpyconfig.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/_numpyconfig.h
rename to include/numpy/_numpyconfig.h
diff --git a/neuralcoref/cli/include/numpy/arrayobject.h b/include/numpy/arrayobject.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/arrayobject.h
rename to include/numpy/arrayobject.h
diff --git a/neuralcoref/cli/include/numpy/arrayscalars.h b/include/numpy/arrayscalars.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/arrayscalars.h
rename to include/numpy/arrayscalars.h
diff --git a/neuralcoref/cli/include/numpy/halffloat.h b/include/numpy/halffloat.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/halffloat.h
rename to include/numpy/halffloat.h
diff --git a/neuralcoref/cli/include/numpy/multiarray_api.txt b/include/numpy/multiarray_api.txt
similarity index 100%
rename from neuralcoref/cli/include/numpy/multiarray_api.txt
rename to include/numpy/multiarray_api.txt
diff --git a/neuralcoref/cli/include/numpy/ndarrayobject.h b/include/numpy/ndarrayobject.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/ndarrayobject.h
rename to include/numpy/ndarrayobject.h
diff --git a/neuralcoref/cli/include/numpy/ndarraytypes.h b/include/numpy/ndarraytypes.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/ndarraytypes.h
rename to include/numpy/ndarraytypes.h
diff --git a/neuralcoref/cli/include/numpy/noprefix.h b/include/numpy/noprefix.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/noprefix.h
rename to include/numpy/noprefix.h
diff --git a/neuralcoref/cli/include/numpy/npy_3kcompat.h b/include/numpy/npy_3kcompat.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_3kcompat.h
rename to include/numpy/npy_3kcompat.h
diff --git a/neuralcoref/cli/include/numpy/npy_common.h b/include/numpy/npy_common.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_common.h
rename to include/numpy/npy_common.h
diff --git a/neuralcoref/cli/include/numpy/npy_cpu.h b/include/numpy/npy_cpu.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_cpu.h
rename to include/numpy/npy_cpu.h
diff --git a/neuralcoref/cli/include/numpy/npy_deprecated_api.h b/include/numpy/npy_deprecated_api.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_deprecated_api.h
rename to include/numpy/npy_deprecated_api.h
diff --git a/neuralcoref/cli/include/numpy/npy_endian.h b/include/numpy/npy_endian.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_endian.h
rename to include/numpy/npy_endian.h
diff --git a/neuralcoref/cli/include/numpy/npy_interrupt.h b/include/numpy/npy_interrupt.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_interrupt.h
rename to include/numpy/npy_interrupt.h
diff --git a/neuralcoref/cli/include/numpy/npy_math.h b/include/numpy/npy_math.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_math.h
rename to include/numpy/npy_math.h
diff --git a/neuralcoref/cli/include/numpy/npy_no_deprecated_api.h b/include/numpy/npy_no_deprecated_api.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_no_deprecated_api.h
rename to include/numpy/npy_no_deprecated_api.h
diff --git a/neuralcoref/cli/include/numpy/npy_os.h b/include/numpy/npy_os.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/npy_os.h
rename to include/numpy/npy_os.h
diff --git a/neuralcoref/cli/include/numpy/numpyconfig.h b/include/numpy/numpyconfig.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/numpyconfig.h
rename to include/numpy/numpyconfig.h
diff --git a/neuralcoref/cli/include/numpy/old_defines.h b/include/numpy/old_defines.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/old_defines.h
rename to include/numpy/old_defines.h
diff --git a/neuralcoref/cli/include/numpy/oldnumeric.h b/include/numpy/oldnumeric.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/oldnumeric.h
rename to include/numpy/oldnumeric.h
diff --git a/neuralcoref/cli/include/numpy/ufunc_api.txt b/include/numpy/ufunc_api.txt
similarity index 100%
rename from neuralcoref/cli/include/numpy/ufunc_api.txt
rename to include/numpy/ufunc_api.txt
diff --git a/neuralcoref/cli/include/numpy/ufuncobject.h b/include/numpy/ufuncobject.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/ufuncobject.h
rename to include/numpy/ufuncobject.h
diff --git a/neuralcoref/cli/include/numpy/utils.h b/include/numpy/utils.h
similarity index 100%
rename from neuralcoref/cli/include/numpy/utils.h
rename to include/numpy/utils.h
diff --git a/neuralcoref/__init__.py b/neuralcoref/__init__.py
index 79caa3e..fac4887 100644
--- a/neuralcoref/__init__.py
+++ b/neuralcoref/__init__.py
@@ -1,27 +1,69 @@
 # coding: utf8
-from __future__ import unicode_literals
+"""
+Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py
+To create the package for pypi.
+
+1. Change the version in __init__.py and setup.py.
+
+2. Commit these changes with the message: "Release: VERSION"
+
+3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' "
+   Push the tag to git: git push --tags origin master
+
+4. Build both the sources and the wheel. Do not change anything in setup.py between
+   creating the wheel and the source distribution (obviously).
+
+   For the wheel, run: "python setup.py bdist_wheel" in the top level neuralcoref directory.
+   (this will build a wheel for the python version you use to build it - make sure you use python 3.x).
+
+   For the sources, run: "python setup.py sdist"
+   You should now have a /dist directory with both .whl and .tar.gz source versions of neuralcoref.
+
+5. Check that everything looks correct by uploading the package to the pypi test server:
+
+   twine upload dist/* -r pypitest
+   (PyPI suggests using twine, as other methods upload files via plaintext.)
+
+   Check that you can install it in a virtualenv by running:
+   pip install -i https://testpypi.python.org/pypi neuralcoref
+
+6. Upload the final version to actual pypi:
+   twine upload dist/* -r pypi
+
+7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
+
+"""
+from __future__ import unicode_literals, absolute_import
+
+import os
 import shutil
 import tarfile
 import tempfile
 import logging
 from .neuralcoref import NeuralCoref
-from .file_utils import NEURALCOREF_CACHE, cached_path
+from .file_utils import NEURALCOREF_MODEL_URL, NEURALCOREF_MODEL_PATH, NEURALCOREF_CACHE, cached_path
 
-__all__ = ['NeuralCoref']
+__all__ = ['NeuralCoref', 'add_to_pipe']
+__version__ = "4.0.0"
 
 logger = logging.getLogger(__name__)
 
-MODEL_URL = "https://s3.amazonaws.com/models.huggingface.co/neuralcoref/neuralcoref_model.tar.gz"
-LOCAL_PATH = os.path.join(str(NEURALCOREF_CACHE), "/neuralcoref/")
-
-try:
-    local_model = cached_path(LOCAL_PATH)
-except:
-    os.makedirs(LOCAL_PATH)
-    downloaded_model = cached_path(MODEL_URL)
+if os.path.exists(NEURALCOREF_MODEL_PATH) and os.path.exists(os.path.join(NEURALCOREF_MODEL_PATH, "cfg")):
+    logger.info("Loading model from {}".format(NEURALCOREF_MODEL_PATH))
+    local_model = cached_path(NEURALCOREF_MODEL_PATH)
+else:
+    if not os.path.exists(NEURALCOREF_MODEL_PATH):
+        os.makedirs(NEURALCOREF_MODEL_PATH)
+    logger.info("Getting model from {} or cache".format(NEURALCOREF_MODEL_URL))
+    downloaded_model = cached_path(NEURALCOREF_MODEL_URL)
 
-    logger.info("extracting archive file {} to dir {}".format(downloaded_model, LOCAL_PATH))
+    logger.info("extracting archive file {} to dir {}".format(downloaded_model, NEURALCOREF_MODEL_PATH))
     with tarfile.open(downloaded_model, 'r:gz') as archive:
-        archive.extractall(LOCAL_PATH)
+        archive.extractall(NEURALCOREF_CACHE)
+
+def add_to_pipe(nlp):
+    coref = NeuralCoref(nlp.vocab)
+    nlp.add_pipe(coref, name='neuralcoref')
+    return nlp
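
Usage note: `add_to_pipe` above becomes the public entry point, now that the packaging/download CLI is deleted below and the model loads itself from the cache at construction time. A minimal usage sketch, assuming a spaCy English model is installed under the shortcut 'en' (the `coref_clusters` extension is registered in neuralcoref.pyx):

    import spacy
    import neuralcoref

    nlp = spacy.load('en')
    neuralcoref.add_to_pipe(nlp)

    doc = nlp(u'My sister has a dog. She loves him.')
    print(doc._.coref_clusters)  # clusters resolved by the 'neuralcoref' pipe
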
diff --git a/neuralcoref/cli/__init__.py b/neuralcoref/cli/__init__.py
deleted file mode 100644
index b6ac118..0000000
--- a/neuralcoref/cli/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .package import package
-from .download import download
\ No newline at end of file
diff --git a/neuralcoref/cli/download.py b/neuralcoref/cli/download.py
deleted file mode 100644
index bd43da8..0000000
--- a/neuralcoref/cli/download.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# coding: utf8
-# Copied from spaCy repo - all glory to Explosion AI
-from __future__ import unicode_literals
-
-import plac
-import requests
-import os
-import subprocess
-import sys
-
-from spacy.cli._messages import Messages
-from spacy.util import prints, get_package_path
-from spacy import util
-from .. import about
-
-@plac.annotations(
-    model=("model to download, shortcut or name)", "positional", None, str),
-    direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool),
-    pip_args=("additional arguments to be passed to `pip install` when "
-              "installing the model"))
-def download(model, direct=False, *pip_args):
-    """
-    Download compatible model from default download path using pip. Model
-    can be shortcut, model name or, if --direct flag is set, full model name
-    with version.
-    """
-    if direct:
-        dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
-    else:
-        shortcuts = get_json(about.__shortcuts__, "available shortcuts")
-        model_name = shortcuts.get(model, model)
-        compatibility = get_compatibility()
-        version = get_version(model_name, compatibility)
-        dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
-                            .format(m=model_name, v=version), pip_args)
-    if dl != 0:  # if download subprocess doesn't return 0, exit
-        sys.exit(dl)
-    try:
-        # Get package path here because link uses
-        # pip.get_installed_distributions() to check if model is a
-        # package, which fails if model was just installed via
-        # subprocess
-        package_path = get_package_path(model_name)
-        link(model_name, model, force=True, model_path=package_path)
-    except:
-        # Dirty, but since neuralcoref.download and the auto-linking is
-        # mostly a convenience wrapper, it's best to show a success
-        # message and loading instructions, even if linking fails.
-        prints(Messages.M001.format(name=model_name), title=Messages.M002)
-
-
-def get_json(url, desc):
-    r = requests.get(url)
-    if r.status_code != 200:
-        prints(Messages.M004.format(desc=desc, version=about.__version__),
-               title=Messages.M003.format(code=r.status_code), exits=1)
-    return r.json()
-
-
-def get_compatibility():
-    version = about.__version__
-    version = version.rsplit('.dev', 1)[0]
-    comp_table = get_json(about.__compatibility__, "compatibility table")
-    comp = comp_table['neuralcoref']
-    if version not in comp:
-        prints(Messages.M006.format(version=version), title=Messages.M005,
-               exits=1)
-    return comp[version]
-
-
-def get_version(model, comp):
-    model = model.rsplit('.dev', 1)[0]
-    if model not in comp:
-        prints(Messages.M007.format(name=model, version=about.__version__),
-               title=Messages.M005, exits=1)
-    return comp[model][0]
-
-
-def download_model(filename, user_pip_args=None):
-    download_url = about.__download_url__ + '/' + filename
-    pip_args = ['--no-cache-dir', '--no-deps']
-    if user_pip_args:
-        pip_args.extend(user_pip_args)
-    cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url]
-    return subprocess.call(cmd, env=os.environ.copy())
diff --git a/neuralcoref/cli/package.py b/neuralcoref/cli/package.py
deleted file mode 100644
index 41e9333..0000000
--- a/neuralcoref/cli/package.py
+++ /dev/null
@@ -1,308 +0,0 @@
-# coding: utf8
-# Copied from spaCy repo - all glory to Explosion AI
-from __future__ import unicode_literals
-
-import shutil
-from pathlib import Path
-import plac
-
-from srsly import json_dumps, read_json
-
-from spacy.cli._messages import Messages
-from spacy.compat import path2str
-from spacy.util import prints
-from spacy import util
-from spacy import about
-
-
-@plac.annotations(
-    input_dir=("directory with model data", "positional", None, str),
-    output_dir=("output parent directory", "positional", None, str),
-    meta_path=("path to meta.json", "option", "m", str),
-    create_meta=("create meta.json, even if one exists in directory – if "
-                 "existing meta is found, entries are shown as defaults in "
-                 "the command line prompt", "flag", "c", bool),
-    force=("force overwriting of existing model directory in output directory",
-           "flag", "f", bool))
-def package(input_dir, output_dir, meta_path=None, create_meta=False,
-            force=False):
-    """
-    Generate Python package for model data, including meta and required
-    installation files. A new directory will be created in the specified
-    output directory, and model data will be copied over.
-    """
-    input_path = util.ensure_path(input_dir)
-    output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta_path)
-    if not input_path or not input_path.exists():
-        prints(input_path, title=Messages.M008, exits=1)
-    if not output_path or not output_path.exists():
-        prints(output_path, title=Messages.M040, exits=1)
-    if meta_path and not meta_path.exists():
-        prints(meta_path, title=Messages.M020, exits=1)
-
-    meta_path = meta_path or input_path / 'meta.json'
-    if meta_path.is_file():
-        meta = read_json(meta_path)
-        if not create_meta:  # only print this if user doesn't want to overwrite
-            prints(meta_path, title=Messages.M041)
-        else:
-            meta = generate_meta(input_dir, meta)
-    meta = validate_meta(meta, ['lang', 'name', 'version'])
-    model_name = meta['lang'] + '_' + meta['name']
-    model_name_v = model_name + '-' + meta['version']
-    main_path = output_path / model_name_v
-    package_path = main_path / model_name
-    bin_path = main_path / 'bin'
-    include_path = main_path / 'include'
-    orig_nc_path = Path(__file__).parent.parent
-    nc_path = package_path / 'neuralcoref'
-
-    create_dirs(package_path, force)
-    create_dirs(bin_path, force)
-    create_dirs(nc_path, force)
-
-    shutil.copytree(path2str(input_path),
-                    path2str(package_path / model_name_v))
-
-    orig_include_path = path2str(Path(__file__).parent / 'include')
-    shutil.copytree(path2str(orig_include_path),
-                    path2str(include_path))
-
-    nc1_path = path2str(orig_nc_path / 'neuralcoref.pyx')
-    nc2_path = path2str(orig_nc_path / 'neuralcoref.pxd')
-    shutil.copyfile(path2str(nc1_path),
-                    path2str(nc_path / 'neuralcoref.pyx'))
-    shutil.copyfile(path2str(nc2_path),
-                    path2str(nc_path / 'neuralcoref.pxd'))
-    create_file(nc_path / '__init__.py', TEMPLATE_INIT_NC)
-    create_file(nc_path / '__init__.pxd', TEMPLATE_INIT_PXD)
-
-    orig_bin_path = path2str(Path(__file__).parent.parent.parent / 'bin' / 'cythonize.py')
-    shutil.copyfile(path2str(orig_bin_path),
-                    path2str(bin_path / 'cythonize.py'))
-
-    create_file(main_path / 'meta.json', json_dumps(meta))
-    create_file(main_path / 'setup.py', TEMPLATE_SETUP)
-    create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
-    create_file(package_path / '__init__.py', TEMPLATE_INIT.format(model_name))
-    create_file(package_path / '__init__.pxd', TEMPLATE_INIT_PXD)
-    prints(main_path, Messages.M043,
-           title=Messages.M042.format(name=model_name_v))
-
-
-def create_dirs(package_path, force):
-    if package_path.exists():
-        if force:
-            shutil.rmtree(path2str(package_path))
-        else:
-            prints(package_path, Messages.M045, title=Messages.M044, exits=1)
-    Path.mkdir(package_path, parents=True)
-
-
-def create_file(file_path, contents):
-    file_path.touch()
-    file_path.open('w', encoding='utf-8').write(contents)
-
-
-def generate_meta(model_path, existing_meta):
-    meta = existing_meta or {}
-    settings = [('lang', 'Model language', meta.get('lang', 'en')),
-                ('name', 'Model name', meta.get('name', 'model')),
-                ('version', 'Model version', meta.get('version', '0.0.0')),
-                ('spacy_version', 'Required spaCy version',
-                 '>=%s,<3.0.0' % about.__version__),
-                ('description', 'Model description',
-                 meta.get('description', False)),
-                ('author', 'Author', meta.get('author', False)),
-                ('email', 'Author email', meta.get('email', False)),
-                ('url', 'Author website', meta.get('url', False)),
-                ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
-    nlp = util.load_model_from_path(Path(model_path))
-    meta['pipeline'] = nlp.pipe_names
-    meta['vectors'] = {'width': nlp.vocab.vectors_length,
-                       'vectors': len(nlp.vocab.vectors),
-                       'keys': nlp.vocab.vectors.n_keys}
-    prints(Messages.M047, title=Messages.M046)
-    for setting, desc, default in settings:
-        response = util.get_raw_input(desc, default)
-        meta[setting] = default if response == '' and default else response
-    if about.__title__ != 'spacy':
-        meta['parent_package'] = about.__title__
-    return meta
-
-
-def validate_meta(meta, keys):
-    for key in keys:
-        if key not in meta or meta[key] == '':
-            prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
-    return meta
-
-
-TEMPLATE_SETUP = """
-#!/usr/bin/env python
-from __future__ import print_function
-import io
-import os
-from os import path, walk
-import json
-import sys
-import contextlib
-import subprocess
-from shutil import copy
-from distutils.sysconfig import get_python_inc
-from distutils import ccompiler, msvccompiler
-from setuptools import Extension, setup, find_packages
-
-PACKAGES = find_packages()
-
-PACKAGE_DATA = {'': ['*.pyx', '*.pxd']}
-
-def load_meta(fp):
-    with io.open(fp, encoding='utf8') as f:
-        return json.load(f)
-
-
-def list_files(data_dir):
-    output = []
-    for root, _, filenames in walk(data_dir):
-        for filename in filenames:
-            if not filename.startswith('.'):
-                output.append(path.join(root, filename))
-    output = [path.relpath(p, path.dirname(data_dir)) for p in output]
-    output.append('meta.json')
-    return output
-
-
-def list_requirements(meta):
-    parent_package = meta.get('parent_package', 'spacy')
-    requirements = [parent_package + ">=" + meta['spacy_version']]
-    if 'setup_requires' in meta:
-        requirements += meta['setup_requires']
-    return requirements
-
-@contextlib.contextmanager
-def chdir(new_dir):
-    old_dir = os.getcwd()
-    try:
-        os.chdir(new_dir)
-        sys.path.insert(0, new_dir)
-        yield
-    finally:
-        del sys.path[0]
-        os.chdir(old_dir)
-
-
-def generate_cython(root, source):
-    print('Cythonizing sources in', source)
-    p = subprocess.call([sys.executable,
-                         os.path.join(root, 'bin', 'cythonize.py'),
-                         source], env=os.environ)
-    if p != 0:
-        raise RuntimeError('Running cythonize failed')
-
-
-def is_source_release(model_path):
-    return os.path.exists(os.path.join(model_path, 'neuralcoref/neuralcoref.cpp'))
-
-
-def setup_package():
-    root = path.abspath(path.dirname(__file__))
-
-    with chdir(root):
-        meta_path = path.join(root, 'meta.json')
-        meta = load_meta(meta_path)
-        model_name = str(meta['lang'] + '_' + meta['name'])
-        model_dir = path.join(model_name, model_name + '-' + meta['version'])
-
-        include_dirs = [
-            get_python_inc(plat_specific=True),
-            os.path.join(root, 'include')]
-
-        if (ccompiler.new_compiler().compiler_type == 'msvc'
-            and msvccompiler.get_build_version() == 9):
-            include_dirs.append(os.path.join(root, 'include', 'msvc9'))
-
-        ext_modules = []
-        mod_name = model_name + '.neuralcoref.neuralcoref'
-        mod_path = mod_name.replace('.', '/') + '.cpp'
-        extra_link_args = []
-        # ???
-        # Imported from patch from @mikepb
-        # See Issue #267. Running blind here...
-        if sys.platform == 'darwin':
-            dylib_path = ['..' for _ in range(mod_name.count('.'))]
-            dylib_path = '/'.join(dylib_path)
-            dylib_path = '@loader_path/%s/neuralcoref/platform/darwin/lib' % dylib_path
-            extra_link_args.append('-Wl,-rpath,%s' % dylib_path)
-        ext_modules.append(
-            Extension(mod_name, [mod_path],
-                      language='c++', include_dirs=include_dirs,
-                      extra_link_args=extra_link_args))
-
-        if not is_source_release(model_name):
-            generate_cython(root, model_name)
-
-        copy(meta_path, path.join(model_name))
-        copy(meta_path, model_dir)
-        package_data = PACKAGE_DATA
-        package_data[model_name] = list_files(model_dir)
-        setup(
-            name=model_name,
-            description=meta['description'],
-            author=meta['author'],
-            author_email=meta['email'],
-            url=meta['url'],
-            version=meta['version'],
-            license=meta['license'],
-            ext_modules=ext_modules,
-            packages=PACKAGES,
-            package_data=package_data,
-            install_requires=list_requirements(meta),
-            zip_safe=False,
-        )
-
-
-if __name__ == '__main__':
-    setup_package()
-""".strip()
-
-
-TEMPLATE_MANIFEST = """
-include meta.json
-recursive-include include *.h
-recursive-include bin *.py
-""".strip()
-
-
-TEMPLATE_INIT = """
-# coding: utf8
-from __future__ import unicode_literals
-
-from pathlib import Path
-from spacy.util import load_model_from_init_py, get_model_meta
-from {}.neuralcoref import NeuralCoref
-
-__version__ = get_model_meta(Path(__file__).parent)['version']
-
-
-def load(**overrides):
-    disable = overrides.get('disable', [])
-    if 'neuralcoref' in disable:
-        nlp = load_model_from_init_py(__file__, **overrides)
-    else:
-        overrides['disable'] = disable + ['neuralcoref']
-        nlp = load_model_from_init_py(__file__, **overrides)
-        coref = NeuralCoref(nlp.vocab)
-        coref.from_disk(nlp.path / 'neuralcoref')
-        nlp.add_pipe(coref, name='neuralcoref')
-    return nlp
-""".strip()
-
-TEMPLATE_INIT_NC = """
-from .neuralcoref import NeuralCoref
-""".strip()
-
-TEMPLATE_INIT_PXD = """
-""".strip()
-
diff --git a/neuralcoref/file_utils.py b/neuralcoref/file_utils.py
index da00ca1..ef027f9 100644
--- a/neuralcoref/file_utils.py
+++ b/neuralcoref/file_utils.py
@@ -31,6 +31,9 @@ except (AttributeError, ImportError):
 
 NEURALCOREF_CACHE = os.getenv('NEURALCOREF_CACHE', os.path.join(os.path.expanduser("~"), '.neuralcoref_cache'))
 
+NEURALCOREF_MODEL_URL = "https://s3.amazonaws.com/models.huggingface.co/neuralcoref/neuralcoref.tar.gz"
+NEURALCOREF_MODEL_PATH = os.path.join(str(NEURALCOREF_CACHE), "neuralcoref")
+
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
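
Note: together these two constants pin down where the model archive lands. A small sketch of how the path resolves, restating the defaults defined in file_utils.py:

    import os

    # NEURALCOREF_CACHE falls back to ~/.neuralcoref_cache (see above)
    cache = os.getenv('NEURALCOREF_CACHE',
                      os.path.join(os.path.expanduser('~'), '.neuralcoref_cache'))
    model_path = os.path.join(str(cache), 'neuralcoref')  # == NEURALCOREF_MODEL_PATH
    print(model_path)  # e.g. /home/user/.neuralcoref_cache/neuralcoref
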
diff --git a/neuralcoref/neuralcoref.pyx b/neuralcoref/neuralcoref.pyx
index d3c5b31..30d6789 100644
--- a/neuralcoref/neuralcoref.pyx
+++ b/neuralcoref/neuralcoref.pyx
@@ -18,7 +18,6 @@ cimport cython
 from cpython cimport array
 import array
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
-import cytoolz
 import numpy
 
 from cymem.cymem cimport Pool
@@ -44,6 +43,8 @@ from thinc.v2v import Model, ReLu, Affine
 from thinc.api import chain, clone
 # from thinc.neural.util import get_array_module
 
+from .file_utils import NEURALCOREF_MODEL_PATH
+
 ##############################
 ##### A BUNCH OF SIZES #######
@@ -542,6 +543,9 @@ cdef class NeuralCoref(object):
         Token.set_extension('coref_clusters', getter=self.token_clusters)
         Token.set_extension('coref_scores', getter=self.token_scores)
 
+        # Load from disk
+        self.from_disk(NEURALCOREF_MODEL_PATH)
+
     def __reduce__(self):
         return (NeuralCoref, (self.vocab, self.model), None, None)
 
@@ -595,7 +599,7 @@ cdef class NeuralCoref(object):
         conv_dict = self.cfg.get('conv_dict', None)
         if blacklist is None:
             blacklist = self.cfg.get('blacklist', True)
-        for docs in cytoolz.partition_all(batch_size, stream):
+        for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
             annotations = self.predict(docs, greedyness=greedyness, max_dist=max_dist, max_dist_match=max_dist_match, blacklist=blacklist)
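
Note: `util.minibatch` is spaCy's own batching helper, which lets the cytoolz dependency go away (see requirements.txt and setup.py below). The replacement is near-equivalent; the main behavioural difference is that cytoolz.partition_all yielded tuples while util.minibatch yields lists, which is why the `docs = list(docs)` line keeps working unchanged. A small sketch:

    from spacy import util

    for batch in util.minibatch(range(7), size=3):
        print(batch)  # [0, 1, 2], then [3, 4, 5], then [6]
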
diff --git a/neuralcoref/tests/__init__.py b/neuralcoref/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index ed70eb4..e571c6f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
 spacy
 cython
-numpy
\ No newline at end of file
+numpy
+boto3
+requests
\ No newline at end of file
diff --git a/setup.py b/setup.py
index b845fe0..e52a46f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,16 +1,31 @@
 #!/usr/bin/env python
 from __future__ import print_function
+import io
 import os
 import subprocess
 import sys
 import contextlib
-import textwrap
-import pkg_resources
 from distutils.command.build_ext import build_ext
 from distutils.sysconfig import get_python_inc
+import distutils.util
 from distutils import ccompiler, msvccompiler
 from setuptools import Extension, setup, find_packages
 
+def is_new_osx():
+    """Check whether we're on OSX >= 10.7"""
+    name = distutils.util.get_platform()
+    if sys.platform != "darwin":
+        return False
+    elif name.startswith("macosx-10"):
+        minor_version = int(name.split("-")[1].split(".")[1])
+        if minor_version >= 7:
+            return True
+        else:
+            return False
+    else:
+        return False
+
+
 PACKAGE_DATA = {'': ['*.pyx', '*.pxd'],
                 '': ['*.h'],}
@@ -20,21 +35,79 @@
 MOD_NAMES = ['neuralcoref.neuralcoref']
 
-def is_installed(requirement):
-    try:
-        pkg_resources.require(requirement)
-    except pkg_resources.ResolutionError:
-        return False
-    else:
-        return True
-if not is_installed('numpy>=1.11.0') or not is_installed('spacy>=2.0.4'):
-    print(textwrap.dedent("""
-        Error: requirements needs to be installed first.
-        You can install them via:
-        $ pip install -r requirements.txt
-        """), file=sys.stderr)
-    exit(1)
+
+COMPILE_OPTIONS = {
+    "msvc": ["/Ox", "/EHsc"],
+    "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
+    "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
+}
+
+
+LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
+
+
+if is_new_osx():
+    # On Mac, use libc++ because Apple deprecated use of
+    # libstdc++
+    COMPILE_OPTIONS["other"].append("-stdlib=libc++")
+    LINK_OPTIONS["other"].append("-lc++")
+    # g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
+    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
+    LINK_OPTIONS["other"].append("-nodefaultlibs")
+
+
+USE_OPENMP_DEFAULT = "0" if sys.platform != "darwin" else None
+if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
+    if sys.platform == "darwin":
+        COMPILE_OPTIONS["other"].append("-fopenmp")
+        LINK_OPTIONS["other"].append("-fopenmp")
+        PACKAGE_DATA["spacy.platform.darwin.lib"] = ["*.dylib"]
+        PACKAGES.append("spacy.platform.darwin.lib")
+
+    elif sys.platform == "win32":
+        COMPILE_OPTIONS["msvc"].append("/openmp")
+
+    else:
+        COMPILE_OPTIONS["other"].append("-fopenmp")
+        LINK_OPTIONS["other"].append("-fopenmp")
+
+
+# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
+# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
+class build_ext_options:
+    def build_options(self):
+        for e in self.extensions:
+            e.extra_compile_args += COMPILE_OPTIONS.get(
+                self.compiler.compiler_type, COMPILE_OPTIONS["other"]
+            )
+        for e in self.extensions:
+            e.extra_link_args += LINK_OPTIONS.get(
+                self.compiler.compiler_type, LINK_OPTIONS["other"]
+            )
+
+
+class build_ext_subclass(build_ext, build_ext_options):
+    def build_extensions(self):
+        build_ext_options.build_options(self)
+        build_ext.build_extensions(self)
+
+
+# def is_installed(requirement):
+#     try:
+#         pkg_resources.require(requirement)
+#     except pkg_resources.ResolutionError:
+#         return False
+#     else:
+#         return True

+# if not is_installed('numpy>=1.11.0') or not is_installed('spacy>=2.1.0'):
+#     print(textwrap.dedent("""
+#         Error: requirements needs to be installed first.
+#         You can install them via:
+#         $ pip install -r requirements.txt
+#         """), file=sys.stderr)
+#     exit(1)
 
 @contextlib.contextmanager
 def chdir(new_dir):
@@ -70,11 +143,11 @@ def setup_package():
 
         include_dirs = [
             get_python_inc(plat_specific=True),
-            os.path.join(root, 'neuralcoref', 'cli', 'include')]
+            os.path.join(root, 'include')]
 
         if (ccompiler.new_compiler().compiler_type == 'msvc'
             and msvccompiler.get_build_version() == 9):
-            include_dirs.append(os.path.join(root, 'neuralcoref', 'cli', 'include', 'msvc9'))
+            include_dirs.append(os.path.join(root, 'include', 'msvc9'))
 
         ext_modules = []
         for mod_name in MOD_NAMES:
@@ -94,30 +167,41 @@ def setup_package():
                           extra_link_args=extra_link_args))
 
     setup(name='neuralcoref',
-          version='3.1',
+          version='4.0',
           description="Coreference Resolution in spaCy with Neural Networks",
           url='https://github.com/huggingface/neuralcoref',
           author='Thomas Wolf',
           author_email='thomwolf@gmail.com',
           ext_modules=ext_modules,
-          include_dirs=[numpy.get_include()],
           classifiers=[
              'Development Status :: 3 - Alpha',
              'Environment :: Console',
              'Intended Audience :: Developers',
-              'Programming Language :: Python',
-              'Programming Language :: Python :: 2.7',
-              'Programming Language :: Python :: 3.3',
-              'Programming Language :: Python :: 3.4',
-              'Programming Language :: Python :: 3.6'
+              "Intended Audience :: Science/Research",
+              "License :: OSI Approved :: MIT License",
+              "Operating System :: POSIX :: Linux",
+              "Operating System :: MacOS :: MacOS X",
+              "Operating System :: Microsoft :: Windows",
+              "Programming Language :: Cython",
+              "Programming Language :: Python :: 2",
+              "Programming Language :: Python :: 2.7",
+              "Programming Language :: Python :: 3",
+              "Programming Language :: Python :: 3.4",
+              "Programming Language :: Python :: 3.5",
+              "Programming Language :: Python :: 3.6",
+              "Programming Language :: Python :: 3.7",
+              "Topic :: Scientific/Engineering",
          ],
-          install_requires=['numpy', 'spacy>=2.1.0', 'cytoolz'],
+          install_requires=['numpy>=1.15.0', 'spacy>=2.1.0'],
+          setup_requires=["wheel"],
+          python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
           packages=PACKAGES,
           package_data=PACKAGE_DATA,
           keywords='NLP chatbots coreference resolution',
           license='MIT',
           zip_safe=False,
-          platforms='any')
+          platforms='any',
+          cmdclass={"build_ext": build_ext_subclass})
 
 
 if __name__ == '__main__':
     setup_package()
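
Note on the build_ext subclassing in setup.py: distutils only exposes the concrete compiler object inside build_extensions(), after finalize_options() has run, so per-compiler flags have to be appended there rather than passed to setup(). A condensed, standalone sketch of the same pattern (class name is hypothetical, not part of this patch):

    from distutils.command.build_ext import build_ext

    COMPILE_OPTIONS = {'msvc': ['/Ox', '/EHsc'], 'other': ['-O2']}

    class build_ext_with_flags(build_ext):
        def build_extensions(self):
            # self.compiler.compiler_type is only known at this point
            flags = COMPILE_OPTIONS.get(self.compiler.compiler_type,
                                        COMPILE_OPTIONS['other'])
            for ext in self.extensions:
                ext.extra_compile_args += flags
            build_ext.build_extensions(self)
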