diff --git a/setup.py b/setup.py index eb6db8b75..b53131194 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ packages=[ f"texar.{name}" - for name in setuptools.find_packages(where='texar/') + for name in setuptools.find_packages(where='texar') ], platforms='any', diff --git a/texar/torch/data/data/text_data_base.py b/texar/torch/data/data/text_data_base.py index a24f7bd0d..e579186df 100644 --- a/texar/torch/data/data/text_data_base.py +++ b/texar/torch/data/data/text_data_base.py @@ -56,7 +56,7 @@ class TextLineDataSource(DataSource[List[str]]): def __init__(self, file_paths: MaybeList[str], compression_type: Optional[str] = None, - encoding: Optional[str] = None, + encoding: Optional[str] = "utf-8", delimiter: Optional[str] = None, max_length: Optional[int] = None): if compression_type is not None: diff --git a/texar/torch/data/data_utils.py b/texar/torch/data/data_utils.py index c9ed140f8..dfc2e35eb 100644 --- a/texar/torch/data/data_utils.py +++ b/texar/torch/data/data_utils.py @@ -195,7 +195,7 @@ def read_words(filename: str, newline_token: Optional[str] = None) -> List[str]: Returns: A list of words. """ - with open(filename, "r") as f: + with open(filename, "r", encoding="utf-8") as f: if Py3: if newline_token is None: return f.read().split() diff --git a/texar/torch/data/vocabulary.py b/texar/torch/data/vocabulary.py index 3ca20fa83..b8a17aaf5 100644 --- a/texar/torch/data/vocabulary.py +++ b/texar/torch/data/vocabulary.py @@ -115,7 +115,7 @@ def load(self, filename: str) \ where and :attr:`token_to_id_map_py` are python `defaultdict` instances. """ - with open(filename, "r") as vocab_file: + with open(filename, "r", encoding="utf-8") as vocab_file: vocab = list(line.strip() for line in vocab_file) warnings.simplefilter("ignore", UnicodeWarning)