From 4bbb9921c57bed3a0a49c3b4a0ee8d468514e3b3 Mon Sep 17 00:00:00 2001 From: Ndamulelo Nemakhavhani Date: Sat, 1 Jun 2024 15:42:43 +0000 Subject: [PATCH 1/5] wip --- our_stopwords/__init__.py | 0 our_stopwords/_main__.py | 0 our_stopwords/data/nso.jsonl | 1 + our_stopwords/data/ven.jsonl | 1 + setup.py | 23 +++++++++++++++++++++++ 5 files changed, 25 insertions(+) create mode 100644 our_stopwords/__init__.py create mode 100644 our_stopwords/_main__.py create mode 100644 our_stopwords/data/nso.jsonl create mode 100644 our_stopwords/data/ven.jsonl create mode 100644 setup.py diff --git a/our_stopwords/__init__.py b/our_stopwords/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/our_stopwords/_main__.py b/our_stopwords/_main__.py new file mode 100644 index 0000000..e69de29 diff --git a/our_stopwords/data/nso.jsonl b/our_stopwords/data/nso.jsonl new file mode 100644 index 0000000..df7b659 --- /dev/null +++ b/our_stopwords/data/nso.jsonl @@ -0,0 +1 @@ +{"eng": "with", "ven": "le"} \ No newline at end of file diff --git a/our_stopwords/data/ven.jsonl b/our_stopwords/data/ven.jsonl new file mode 100644 index 0000000..6f2546c --- /dev/null +++ b/our_stopwords/data/ven.jsonl @@ -0,0 +1 @@ +{"eng": "with", "ven": "na"} \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..81bcc50 --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup, find_packages + +setup( + name='our_stopwords', + version='1.0', + packages=find_packages(), + include_package_data=True, + install_requires=[ + 'pandas', + 'scikit-learn' + ], + author='Ndamulelo Nemakhavhani', + author_email='endeesa@yahoo.com', + description='A package for accessing multilingual stop words for South African Bantu Languages.', + long_description=open('README.md', encoding='utf-8').read(), + long_description_content_type='text/markdown', + url='https://github.com/ndamulelonemakh/our-stopwords', + classifiers=[ + 'Programming Language :: Python :: 3', + 'License 
:: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], +) From 3ac0f11b20558c2f8dba9297ca7766e8c673c01d Mon Sep 17 00:00:00 2001 From: Ndamulelo Nemakhavhani Date: Sat, 1 Jun 2024 15:50:29 +0000 Subject: [PATCH 2/5] update client --- our_stopwords/__main__.py | 48 +++++++++++++++++++++++++++++++++++++++ our_stopwords/_main__.py | 0 2 files changed, 48 insertions(+) create mode 100644 our_stopwords/__main__.py delete mode 100644 our_stopwords/_main__.py diff --git a/our_stopwords/__main__.py b/our_stopwords/__main__.py new file mode 100644 index 0000000..6a6085f --- /dev/null +++ b/our_stopwords/__main__.py @@ -0,0 +1,48 @@ +# our_stopwords/__main__.py + +import os +import json + + +DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') +MAIN_REF_URL = "https://raw.githubusercontent.com/ndamulelonemakh/our-stopwords/feature/pip-package/za_stopwords.main.jsonl" + + +def list_available_languages(): + """ + List all available language codes. + + Returns: + - list: List of available language codes. + """ + files = os.listdir(DATA_DIR) + language_codes = [os.path.splitext(file)[0] for file in files if file.endswith('.jsonl')] + return language_codes + + +def get_stopwords(language_code: str): + """ + Retrieve stop words for a specific language. + + Parameters: + - language_code (str): Language code (e.g., 'ven' for Venda). + + Returns: + - list: One parsed JSON record per line of the language's .jsonl data file. + """ + # Ensure language code is lowercase + language_code = language_code.lower() + + # Check if the language code is valid + valid_codes = list_available_languages() + if language_code not in valid_codes: + raise ValueError(f"Unsupported language code '{language_code}'. 
Please use one of {valid_codes}.") + + # Load stop words from the JSON lines file + file_path = os.path.join(DATA_DIR, f'{language_code}.jsonl') + with open(file_path, 'r', encoding='utf-8') as file: + stop_words = [] + for line in file: + stop_words.append(json.loads(line.strip())) + + return stop_words diff --git a/our_stopwords/_main__.py b/our_stopwords/_main__.py deleted file mode 100644 index e69de29..0000000 From a97b9ebd8f6c5c5c94cb125873bf310a61a33463 Mon Sep 17 00:00:00 2001 From: Ndamulelo Nemakhavhani Date: Sat, 1 Jun 2024 16:03:58 +0000 Subject: [PATCH 3/5] add cli support --- our_stopwords/__main__.py | 37 ++++++++++++++++++++++++++++ our_stopwords/cli.py | 51 +++++++++++++++++++++++++++++++++++++++ setup.py | 8 ++++++ 3 files changed, 96 insertions(+) create mode 100644 our_stopwords/cli.py diff --git a/our_stopwords/__main__.py b/our_stopwords/__main__.py index 6a6085f..8685a83 100644 --- a/our_stopwords/__main__.py +++ b/our_stopwords/__main__.py @@ -46,3 +46,40 @@ def get_stopwords(language_code: str): stop_words.append(json.loads(line.strip())) return stop_words + + +def cli(): + try: + if args.command == 'list': + available_languages = list_available_languages() + print("Available languages:") + for lang in available_languages: + print(f" - {lang}") + + elif args.command == 'get': + stopwords = get_stopwords(args.language_code) + for s in stopwords: + print(s) + else: + parser.print_help() + except Exception as e: + print(f"Error: {str(e)}") + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="CLI for accessing multilingual stop words for South African Bantu languages.") + subparsers = parser.add_subparsers(dest='command', title='Commands', description='Valid commands') + list_parser = subparsers.add_parser('list', help='List all available languages') + get_parser = subparsers.add_parser('get', help='Get stop words for a specific language') + get_parser.add_argument('language_code', type=str, help='Language 
code (e.g., "ven" for Venda)') + + args = parser.parse_args() + cli() + + +"""Usage: + +our_stopwords list +our_stopwords get ven +""" \ No newline at end of file diff --git a/our_stopwords/cli.py b/our_stopwords/cli.py new file mode 100644 index 0000000..e03561a --- /dev/null +++ b/our_stopwords/cli.py @@ -0,0 +1,51 @@ +# our_stopwords/cli.py + +import argparse +import json +import our_stopwords.__main__ as stopwords_api + +def list_languages(): + """ + CLI command to list all available languages. + """ + available_languages = stopwords_api.list_available_languages() + print("Available languages:") + for lang in available_languages: + print(f" - {lang}") + +def get_stopwords(language_code): + """ + CLI command to get stop words for a specific language. + + Parameters: + - language_code (str): Language code (e.g., 'ven' for Venda). + """ + try: + stopwords = stopwords_api.get_stopwords(language_code) + print(json.dumps(stopwords, indent=2, ensure_ascii=False)) + except ValueError as e: + print(f"Error: {str(e)}") + +def main(): + parser = argparse.ArgumentParser(description="CLI for accessing multilingual stop words for African languages.") + + subparsers = parser.add_subparsers(dest='command', title='Commands', description='Valid commands') + + # Subcommand: list + list_parser = subparsers.add_parser('list', help='List all available languages') + + # Subcommand: get + get_parser = subparsers.add_parser('get', help='Get stop words for a specific language') + get_parser.add_argument('language_code', type=str, help='Language code (e.g., "ven" for Venda)') + + args = parser.parse_args() + + if args.command == 'list': + list_languages() + elif args.command == 'get': + get_stopwords(args.language_code) + else: + parser.print_help() + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 81bcc50..2daf5c2 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,9 @@ 'pandas', 'scikit-learn' ], + package_data={ + 'our_stopwords': ['data/*.jsonl'], + }, author='Ndamulelo Nemakhavhani', 
author_email='endeesa@yahoo.com', description='A package for accessing multilingual stop words for South African Bantu Languages.', @@ -20,4 +23,9 @@ 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', ], + entry_points={ + 'console_scripts': [ + 'our_stopwords = our_stopwords.cli:main' + ] + }, ) From 2022c469f20bc095f5e7b2b6ab2bfff26099affd Mon Sep 17 00:00:00 2001 From: Ndamulelo Nemakhavhani Date: Sat, 1 Jun 2024 16:09:44 +0000 Subject: [PATCH 4/5] add actions template --- .github/workflows/deploy.yml | 31 +++++++++++++++++++++++++++++++ our_stopwords/__main__.py | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/deploy.yml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..5c0c11c --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,31 @@ +name: Deploy + +on: + push: + tags: + - '*' + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel twine + + - name: Build and publish to PyPI + if: startsWith(github.ref, 'refs/tags/') + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + python -m twine upload --repository pypi dist/* diff --git a/our_stopwords/__main__.py b/our_stopwords/__main__.py index 8685a83..90afd83 100644 --- a/our_stopwords/__main__.py +++ b/our_stopwords/__main__.py @@ -5,7 +5,6 @@ DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -MAIN_REF_URL = "https://raw.githubusercontent.com/ndamulelonemakh/our-stopwords/feature/pip-package/za_stopwords.main.jsonl" def list_available_languages(): @@ -65,6 +64,7 @@ def cli(): except Exception as e: print(f"Error: {str(e)}") + if __name__ == "__main__": 
import argparse From ff0c6e7109ef89779c86a31a71c7e18b8211c4fc Mon Sep 17 00:00:00 2001 From: Ndamulelo Nemakhavhani Date: Sat, 1 Jun 2024 16:15:01 +0000 Subject: [PATCH 5/5] update README --- README.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3243c0d..444a8c2 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,58 @@ * We present a list of auto-translated stopwords from English and adapt them to native [South African Bantu Languages](https://pubs.cs.uct.ac.za/id/eprint/1334/1/icadl_2019_banturecognition.pdf) + +## Installation + +You can install the package via pip: + +```bash +pip install -U our-stopwords +``` + +## Quick Start + +* There are two ways to use the installed version of `our-stopwords` + + +### 1. Using as a Python Library + +```python +import our_stopwords + +# List all available languages +available_languages = our_stopwords.list_available_languages() +print("Available languages:", available_languages) + +# Get the list of stopwords for a specific language +stopwords = our_stopwords.get_stopwords('ven') +print(stopwords) +# Output: [{'eng': 'a', 'ven': 'a'}, {'eng': 'about', 'ven': 'nga'}, {'eng': 'after', 'ven': 'mulweli'}, ...] +``` + +### 2. Usage from the CLI + +### List Available Languages + +You can list all available languages supported by the package: + +```bash +our_stopwords list +``` + +### Get Stop Words for a Language + +To get stop words for a specific language, use the following command (replace `ven` with the language code of your choice): + +```bash +our_stopwords get ven +``` + + + + -## Usage +## Manual Usage - The data is provided in [JSON Lines](https://jsonlines.org/) format. Here is an example of using the stopwords in Python: