From 16d34ff47155fa06f851c363d01069fd9ca64226 Mon Sep 17 00:00:00 2001 From: Jiace Sun Date: Tue, 28 Mar 2023 12:27:56 -0700 Subject: [PATCH 1/2] update comments --- mathtranslate/process_latex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mathtranslate/process_latex.py b/mathtranslate/process_latex.py index efbace6..d7bd0f0 100644 --- a/mathtranslate/process_latex.py +++ b/mathtranslate/process_latex.py @@ -116,7 +116,7 @@ def remove_tex_comments(text): If "%" is at the beginning of a line then delete this line. Returns the processed string. """ - text = re.sub(r"\n(? Date: Tue, 28 Mar 2023 15:47:16 -0700 Subject: [PATCH 2/2] update and add comments --- mathtranslate/__init__.py | 4 +-- mathtranslate/encoding.py | 9 ++++++ mathtranslate/fix_encoding.py | 25 ----------------- mathtranslate/process_latex.py | 50 +++++++++++++++------------------- mathtranslate/translate.py | 37 +++++++++++++------------ mathtranslate/translate_tex.py | 9 +++--- 6 files changed, 57 insertions(+), 77 deletions(-) create mode 100644 mathtranslate/encoding.py delete mode 100644 mathtranslate/fix_encoding.py diff --git a/mathtranslate/__init__.py b/mathtranslate/__init__.py index 6593dd3..99a3e33 100644 --- a/mathtranslate/__init__.py +++ b/mathtranslate/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.1.2" +__version__ = "2.1.3" __author__ = "Jiace Sun" import os @@ -6,7 +6,7 @@ from . import config from . import translate from . import tencent -from . import fix_encoding +from . import encoding from . import process_latex from . import process_text from . import translate_tex diff --git a/mathtranslate/encoding.py b/mathtranslate/encoding.py new file mode 100644 index 0000000..f2dacab --- /dev/null +++ b/mathtranslate/encoding.py @@ -0,0 +1,9 @@ +import chardet + + +def get_file_encoding(filename): + with open(filename, "rb") as f: + data = f.read() + result = chardet.detect(data) + current_encoding = result["encoding"] + return current_encoding diff --git a/mathtranslate/fix_encoding.py b/mathtranslate/fix_encoding.py deleted file mode 100644 index 6a3d1e7..0000000 --- a/mathtranslate/fix_encoding.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import chardet - - -def fix_file_encoding(filename, target_encoding="UTF-8"): - """ - Fixes the encoding of a file with Chinese characters. - """ - # TODO: I guess this function is not needed now? - # detect the current encoding of the file - with open(filename, "rb") as f: - data = f.read() - result = chardet.detect(data) - current_encoding = result["encoding"] - if current_encoding is None: - print(f"Error: Could not detect encoding for {filename}") - return - if current_encoding.upper() == target_encoding.upper(): - return - # read the file in the current encoding and write it back in the target encoding - with open(filename, "r", encoding=current_encoding, errors="replace") as f: - text = f.read() - with open(filename, "w", encoding=target_encoding) as f: - f.write(text) - print(f"{filename} encoding fixed from {current_encoding} to {target_encoding}.") diff --git a/mathtranslate/process_latex.py b/mathtranslate/process_latex.py index d7bd0f0..ffec42a 100644 --- a/mathtranslate/process_latex.py +++ b/mathtranslate/process_latex.py @@ -42,18 +42,18 @@ def modify_after(text): return text -def replace_latex_envs(text): +def replace_latex_objects(text): r""" - Replaces all LaTeX environments in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}", + Replaces all LaTeX objects in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}", applies a given function to the resulting text (excluding the "XMATH_{digit1}_{digit2}_..._{digit_last}" parts), - and returns both the processed text and a list of replaced LaTeX environments. - Supported LaTeX environments: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$, + and returns both the processed text and a list of replaced LaTeX objects. + Supported LaTeX objects: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$, $ $, \( xxx \), \xxx[xxx]{xxx}, \xxx{xxx}, and \xxx. - Returns the processed text and a list of replaced LaTeX environments. + Returns the processed text and a list of replaced LaTeX objects. """ - # define regular expressions for each LaTeX environment - latex_env_regex = [ + # define regular expressions for each LaTeX object + latex_obj_regex = [ r"\$\$(.*?)\$\$", # $$ $$ r"\$(.*?)\$", # $ $ r"\\\[(.*?)\\\]", # \[ xxx \] @@ -64,30 +64,30 @@ def replace_latex_envs(text): pattern_brace, # {xxx} ] - # iterate through each LaTeX environment and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}" + # iterate through each LaTeX object and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}" count = 0 - replaced_envs = [] - for regex_symbol in latex_env_regex: + replaced_objs = [] + for regex_symbol in latex_obj_regex: pattern = regex.compile(regex_symbol, regex.DOTALL) while pattern.search(text): - latex_env = pattern.search(text).group() - replaced_envs.append(f'{latex_env}') + latex_obj = pattern.search(text).group() + replaced_objs.append(f'{latex_obj}') text = pattern.sub(' ' + variable_code(count) + ' ', text, 1) count += 1 text = modify_text(text, modify_before) - return text, replaced_envs + return text, replaced_objs -def recover_latex_envs(text, replaced_envs, final=False): - nenvs = len(replaced_envs) +def recover_latex_objects(text, replaced_objs, final=False): + nobjs = len(replaced_objs) matched_indices = [] - def get_env(digit_str): + def get_obj(digit_str): index = int(''.join(digit_str.split('_'))) matched_indices.append(index) - if index < nenvs: - return replaced_envs[index] + if index < nobjs: + return replaced_objs[index] else: #assert final return '???' @@ -96,16 +96,16 @@ def get_env(digit_str): pattern = re.compile(match_code_replace) total_num = 0 while True: - text, num_modify = pattern.subn(lambda match: get_env(match.group(1)), text) + text, num_modify = pattern.subn(lambda match: get_obj(match.group(1)), text) total_num += num_modify if num_modify == 0: break - n_good = len(set(matched_indices).intersection(set(range(nenvs)))) + n_good = len(set(matched_indices).intersection(set(range(nobjs)))) n_bad1 = len(matched_indices) - n_good - n_bad2 = nenvs - n_good + n_bad2 = nobjs - n_good n_bad = max(n_bad1, n_bad2) if final and n_bad > 0: - print(n_bad, 'latex environments are probably wrong in total', nenvs) + print(n_bad, 'latex objects are probably wrong in total', nobjs) return text @@ -137,12 +137,6 @@ def split_latex_document(text, begin_code, end_code): return body, pre, post -def count_braces(text): - text = text.replace(r'\{', '') - text = text.replace(r'\}', '') - return text.count('{'), text.count('}') - - def process_specific_env(latex, function, env_names): pattern = regex.compile(pattern_env, regex.DOTALL) diff --git a/mathtranslate/translate.py b/mathtranslate/translate.py index ad4a781..ff76adf 100644 --- a/mathtranslate/translate.py +++ b/mathtranslate/translate.py @@ -53,13 +53,13 @@ def __init__(self, translator: TextTranslator, debug=False): if self.debug: self.f_old = open("text_old", "w", encoding='utf-8') self.f_new = open("text_new", "w", encoding='utf-8') - self.f_env = open("envs", "w", encoding='utf-8') + self.f_obj = open("objs", "w", encoding='utf-8') def close(self): if self.debug: self.f_old.close() self.f_new.close() - self.f_env.close() + self.f_obj.close() def translate_paragraph_text(self, text): ''' @@ -86,9 +86,9 @@ def translate_paragraph_text(self, text): def translate_paragraph_latex(self, latex_original_paragraph, num, complete): ''' - Translate a latex paragraph, which means that it could contain latex environments + Translate a latex paragraph, which means that it could contain latex objects ''' - text_original_paragraph, envs = process_latex.replace_latex_envs(latex_original_paragraph) + text_original_paragraph, objs = process_latex.replace_latex_objects(latex_original_paragraph) # Since \n is equivalent to space in latex, we change \n back to space # otherwise the translators view them as separate sentences text_original_paragraph = text_original_paragraph.replace('\n', '') @@ -99,24 +99,24 @@ def translate_paragraph_latex(self, latex_original_paragraph, num, complete): if self.debug: print(f'\n\nParagraph {num}\n\n', file=self.f_old) print(f'\n\nParagraph {num}\n\n', file=self.f_new) - print(f'\n\nParagraph {num}\n\n', file=self.f_env) + print(f'\n\nParagraph {num}\n\n', file=self.f_obj) print(text_original_paragraph, file=self.f_old) print(text_translated_paragraph, file=self.f_new) - for i, env in enumerate(envs): - print(f'env {i}', file=self.f_env) - print(env, file=self.f_env) - latex_translated_paragraph = process_latex.recover_latex_envs(text_translated_paragraph, envs, final=True) + for i, obj in enumerate(objs): + print(f'obj {i}', file=self.f_obj) + print(obj, file=self.f_obj) + latex_translated_paragraph = process_latex.recover_latex_objects(text_translated_paragraph, objs, final=True) return latex_translated_paragraph def split_latex_to_paragraphs(self, latex): ''' - 1. convert latex to text and environments + 1. convert latex to text and objects 2. split text - 3. convert text back to environments + 3. convert text back to objects ''' - text, envs = process_latex.replace_latex_envs(latex) + text, objs = process_latex.replace_latex_objects(latex) paragraphs_text = text.split('\n\n') - paragraphs_latex = [process_latex.recover_latex_envs(paragraph_text, envs) for paragraph_text in paragraphs_text] + paragraphs_latex = [process_latex.recover_latex_objects(paragraph_text, objs) for paragraph_text in paragraphs_text] return paragraphs_latex def _translate_latex_objects(self, match_function, latex_original, names, complete): @@ -166,6 +166,7 @@ def translate_full_latex(self, latex_original): # It is difficult for regex to exclude \{ during match so I replace it to something else and then replace back latex_original = latex_original.replace(r'\{', f'{math_code}LB') latex_original = latex_original.replace(r'\}', f'{math_code}RB') + latex_original = latex_original.replace(r'\%', f'{math_code}PC') latex_original_paragraphs = self.split_latex_to_paragraphs(latex_original) latex_translated_paragraphs = [] @@ -182,19 +183,21 @@ def translate_full_latex(self, latex_original): if self.debug: print(latex_translated, file=open('text_after_main.txt', 'w')) - # TODO: add more environments here + # TODO: add more here print('processing latex environments') latex_translated = self.translate_latex_env(latex_translated, ['abstract', 'acknowledgments', 'itermize', 'enumrate', 'description', 'list'], complete) print('processing latex commands') latex_translated = self.translate_latex_commands(latex_translated, ['section', 'subsection', 'subsubsection', 'caption', 'subcaption', 'footnote'], complete) - print('processing title') - latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete) - latex_translated = latex_translated.replace(f'{math_code}LB', r'\{') latex_translated = latex_translated.replace(f'{math_code}RB', r'\}') + latex_translated = latex_translated.replace(f'{math_code}PC', r'\%') latex_translated = tex_begin + '\n' + latex_translated + '\n' + tex_end + # Title is probably outside the body part + print('processing title') + latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete) + self.close() return latex_translated diff --git a/mathtranslate/translate_tex.py b/mathtranslate/translate_tex.py index 96b585c..b22f803 100644 --- a/mathtranslate/translate_tex.py +++ b/mathtranslate/translate_tex.py @@ -73,7 +73,7 @@ def main(): import mathtranslate from mathtranslate import config from mathtranslate.config import default_engine, default_language_from, default_language_to - from mathtranslate.fix_encoding import fix_file_encoding + from mathtranslate.encoding import get_file_encoding from mathtranslate.translate import TextTranslator, LatexTranslator import argparse parser = argparse.ArgumentParser() @@ -153,18 +153,17 @@ def main(): text_translator = TextTranslator(options.engine, options.l_to, options.l_from) latex_translator = LatexTranslator(text_translator, options.debug) - text_original = open(input_path).read() + input_encoding = get_file_encoding(input_path) + text_original = open(input_path, encoding=input_encoding).read() text_final = latex_translator.translate_full_latex(text_original) with open(output_path, "w", encoding='utf-8') as file: print(text_final, file=file) print(output_path, 'is generated') - fix_file_encoding(output_path) if options.compile: os.system(f'xelatex {output_path}') else: - print(f"You can then manually compile it by running 'xelatex {output_path}'") - print("or compile it online on overleaf") + print(f"You can then compile it locally by running 'xelatex {output_path}' or compile it online on overleaf") if __name__ == '__main__':