From 16d34ff47155fa06f851c363d01069fd9ca64226 Mon Sep 17 00:00:00 2001
From: Jiace Sun <susyustc@gmail.com>
Date: Tue, 28 Mar 2023 12:27:56 -0700
Subject: [PATCH 1/2] update comment removal to also strip indented comment lines

---
 mathtranslate/process_latex.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mathtranslate/process_latex.py b/mathtranslate/process_latex.py
index efbace6..d7bd0f0 100644
--- a/mathtranslate/process_latex.py
+++ b/mathtranslate/process_latex.py
@@ -116,7 +116,7 @@ def remove_tex_comments(text):
     If "%" is at the beginning of a line then delete this line.
     Returns the processed string.
     """
-    text = re.sub(r"\n(?<!\\)%.*?(?=\n)", "", text)
+    text = re.sub(r"\n\s*(?<!\\)%.*?(?=\n)", "", text)
     text = re.sub(r"(?<!\\)%.*?(?=\n)", "", text)
 
     return text

From 8b5c27ceceebf0dafc4e5e6e5658cb93bff658e4 Mon Sep 17 00:00:00 2001
From: Jiace Sun <susyustc@gmail.com>
Date: Tue, 28 Mar 2023 15:47:16 -0700
Subject: [PATCH 2/2] update and add comments

---
 mathtranslate/__init__.py      |  4 +--
 mathtranslate/encoding.py      |  9 ++++++
 mathtranslate/fix_encoding.py  | 25 -----------------
 mathtranslate/process_latex.py | 50 +++++++++++++++-------------------
 mathtranslate/translate.py     | 37 +++++++++++++------------
 mathtranslate/translate_tex.py |  9 +++---
 6 files changed, 57 insertions(+), 77 deletions(-)
 create mode 100644 mathtranslate/encoding.py
 delete mode 100644 mathtranslate/fix_encoding.py

diff --git a/mathtranslate/__init__.py b/mathtranslate/__init__.py
index 6593dd3..99a3e33 100644
--- a/mathtranslate/__init__.py
+++ b/mathtranslate/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.1.2"
+__version__ = "2.1.3"
 __author__ = "Jiace Sun"
 
 import os
@@ -6,7 +6,7 @@
 from . import config
 from . import translate
 from . import tencent
-from . import fix_encoding
+from . import encoding
 from . import process_latex
 from . import process_text
 from . import translate_tex
diff --git a/mathtranslate/encoding.py b/mathtranslate/encoding.py
new file mode 100644
index 0000000..f2dacab
--- /dev/null
+++ b/mathtranslate/encoding.py
@@ -0,0 +1,9 @@
+import chardet
+
+
+def get_file_encoding(filename):
+    """Return the encoding name chardet detects for the file's raw bytes (may be None)."""
+    with open(filename, "rb") as raw_file:
+        raw_bytes = raw_file.read()
+    # Detection only needs the bytes, so it runs after the file is closed.
+    return chardet.detect(raw_bytes)["encoding"]
diff --git a/mathtranslate/fix_encoding.py b/mathtranslate/fix_encoding.py
deleted file mode 100644
index 6a3d1e7..0000000
--- a/mathtranslate/fix_encoding.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import os
-import chardet
-
-
-def fix_file_encoding(filename, target_encoding="UTF-8"):
-    """
-    Fixes the encoding of a file with Chinese characters.
-    """
-    # TODO: I guess this function is not needed now?
-    # detect the current encoding of the file
-    with open(filename, "rb") as f:
-        data = f.read()
-        result = chardet.detect(data)
-        current_encoding = result["encoding"]
-    if current_encoding is None:
-        print(f"Error: Could not detect encoding for {filename}")
-        return
-    if current_encoding.upper() == target_encoding.upper():
-        return
-    # read the file in the current encoding and write it back in the target encoding
-    with open(filename, "r", encoding=current_encoding, errors="replace") as f:
-        text = f.read()
-    with open(filename, "w", encoding=target_encoding) as f:
-        f.write(text)
-    print(f"{filename} encoding fixed from {current_encoding} to {target_encoding}.")
diff --git a/mathtranslate/process_latex.py b/mathtranslate/process_latex.py
index d7bd0f0..ffec42a 100644
--- a/mathtranslate/process_latex.py
+++ b/mathtranslate/process_latex.py
@@ -42,18 +42,18 @@ def modify_after(text):
     return text
 
 
-def replace_latex_envs(text):
+def replace_latex_objects(text):
     r"""
-    Replaces all LaTeX environments in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
+    Replaces all LaTeX objects in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
     applies a given function to the resulting text (excluding the "XMATH_{digit1}_{digit2}_..._{digit_last}" parts),
-    and returns both the processed text and a list of replaced LaTeX environments.
-    Supported LaTeX environments: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
+    and returns both the processed text and a list of replaced LaTeX objects.
+    Supported LaTeX objects: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
     $ $, \( xxx \), \xxx[xxx]{xxx}, \xxx{xxx}, and \xxx.
-    Returns the processed text and a list of replaced LaTeX environments.
+    Returns the processed text and a list of replaced LaTeX objects.
     """
 
-    # define regular expressions for each LaTeX environment
-    latex_env_regex = [
+    # define regular expressions for each LaTeX object
+    latex_obj_regex = [
         r"\$\$(.*?)\$\$",  # $$ $$
         r"\$(.*?)\$",  # $ $
         r"\\\[(.*?)\\\]",  # \[ xxx \]
@@ -64,30 +64,30 @@ def replace_latex_envs(text):
         pattern_brace,  # {xxx}
     ]
 
-    # iterate through each LaTeX environment and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
+    # iterate through each LaTeX object and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
     count = 0
-    replaced_envs = []
-    for regex_symbol in latex_env_regex:
+    replaced_objs = []
+    for regex_symbol in latex_obj_regex:
         pattern = regex.compile(regex_symbol, regex.DOTALL)
         while pattern.search(text):
-            latex_env = pattern.search(text).group()
-            replaced_envs.append(f'{latex_env}')
+            latex_obj = pattern.search(text).group()
+            replaced_objs.append(f'{latex_obj}')
             text = pattern.sub(' ' + variable_code(count) + ' ', text, 1)
             count += 1
 
     text = modify_text(text, modify_before)
-    return text, replaced_envs
+    return text, replaced_objs
 
 
-def recover_latex_envs(text, replaced_envs, final=False):
-    nenvs = len(replaced_envs)
+def recover_latex_objects(text, replaced_objs, final=False):
+    nobjs = len(replaced_objs)
     matched_indices = []
 
-    def get_env(digit_str):
+    def get_obj(digit_str):
         index = int(''.join(digit_str.split('_')))
         matched_indices.append(index)
-        if index < nenvs:
-            return replaced_envs[index]
+        if index < nobjs:
+            return replaced_objs[index]
         else:
             #assert final
             return '???'
@@ -96,16 +96,16 @@ def get_env(digit_str):
     pattern = re.compile(match_code_replace)
     total_num = 0
     while True:
-        text, num_modify = pattern.subn(lambda match: get_env(match.group(1)), text)
+        text, num_modify = pattern.subn(lambda match: get_obj(match.group(1)), text)
         total_num += num_modify
         if num_modify == 0:
             break
-    n_good = len(set(matched_indices).intersection(set(range(nenvs))))
+    n_good = len(set(matched_indices).intersection(set(range(nobjs))))
     n_bad1 = len(matched_indices) - n_good
-    n_bad2 = nenvs - n_good
+    n_bad2 = nobjs - n_good
     n_bad = max(n_bad1, n_bad2)
     if final and n_bad > 0:
-        print(n_bad, 'latex environments are probably wrong in total', nenvs)
+        print(n_bad, 'latex objects are probably wrong in total', nobjs)
     return text
 
 
@@ -137,12 +137,6 @@ def split_latex_document(text, begin_code, end_code):
     return body, pre, post
 
 
-def count_braces(text):
-    text = text.replace(r'\{', '')
-    text = text.replace(r'\}', '')
-    return text.count('{'), text.count('}')
-
-
 def process_specific_env(latex, function, env_names):
     pattern = regex.compile(pattern_env, regex.DOTALL)
 
diff --git a/mathtranslate/translate.py b/mathtranslate/translate.py
index ad4a781..ff76adf 100644
--- a/mathtranslate/translate.py
+++ b/mathtranslate/translate.py
@@ -53,13 +53,13 @@ def __init__(self, translator: TextTranslator, debug=False):
         if self.debug:
             self.f_old = open("text_old", "w", encoding='utf-8')
             self.f_new = open("text_new", "w", encoding='utf-8')
-            self.f_env = open("envs", "w", encoding='utf-8')
+            self.f_obj = open("objs", "w", encoding='utf-8')
 
     def close(self):
         if self.debug:
             self.f_old.close()
             self.f_new.close()
-            self.f_env.close()
+            self.f_obj.close()
 
     def translate_paragraph_text(self, text):
         '''
@@ -86,9 +86,9 @@ def translate_paragraph_text(self, text):
 
     def translate_paragraph_latex(self, latex_original_paragraph, num, complete):
         '''
-        Translate a latex paragraph, which means that it could contain latex environments
+        Translate a latex paragraph, which means that it could contain latex objects
         '''
-        text_original_paragraph, envs = process_latex.replace_latex_envs(latex_original_paragraph)
+        text_original_paragraph, objs = process_latex.replace_latex_objects(latex_original_paragraph)
         # Since \n is equivalent to space in latex, we change \n back to space
         # otherwise the translators view them as separate sentences
         text_original_paragraph = text_original_paragraph.replace('\n', '')
@@ -99,24 +99,24 @@ def translate_paragraph_latex(self, latex_original_paragraph, num, complete):
         if self.debug:
             print(f'\n\nParagraph {num}\n\n', file=self.f_old)
             print(f'\n\nParagraph {num}\n\n', file=self.f_new)
-            print(f'\n\nParagraph {num}\n\n', file=self.f_env)
+            print(f'\n\nParagraph {num}\n\n', file=self.f_obj)
             print(text_original_paragraph, file=self.f_old)
             print(text_translated_paragraph, file=self.f_new)
-            for i, env in enumerate(envs):
-                print(f'env {i}', file=self.f_env)
-                print(env, file=self.f_env)
-        latex_translated_paragraph = process_latex.recover_latex_envs(text_translated_paragraph, envs, final=True)
+            for i, obj in enumerate(objs):
+                print(f'obj {i}', file=self.f_obj)
+                print(obj, file=self.f_obj)
+        latex_translated_paragraph = process_latex.recover_latex_objects(text_translated_paragraph, objs, final=True)
         return latex_translated_paragraph
 
     def split_latex_to_paragraphs(self, latex):
         '''
-        1. convert latex to text and environments
+        1. convert latex to text and objects
         2. split text
-        3. convert text back to environments
+        3. convert text back to objects
         '''
-        text, envs = process_latex.replace_latex_envs(latex)
+        text, objs = process_latex.replace_latex_objects(latex)
         paragraphs_text = text.split('\n\n')
-        paragraphs_latex = [process_latex.recover_latex_envs(paragraph_text, envs) for paragraph_text in paragraphs_text]
+        paragraphs_latex = [process_latex.recover_latex_objects(paragraph_text, objs) for paragraph_text in paragraphs_text]
         return paragraphs_latex
 
     def _translate_latex_objects(self, match_function, latex_original, names, complete):
@@ -166,6 +166,7 @@ def translate_full_latex(self, latex_original):
         # It is difficult for regex to exclude \{ during match so I replace it to something else and then replace back
         latex_original = latex_original.replace(r'\{', f'{math_code}LB')
         latex_original = latex_original.replace(r'\}', f'{math_code}RB')
+        latex_original = latex_original.replace(r'\%', f'{math_code}PC')
 
         latex_original_paragraphs = self.split_latex_to_paragraphs(latex_original)
         latex_translated_paragraphs = []
@@ -182,19 +183,21 @@ def translate_full_latex(self, latex_original):
         if self.debug:
             print(latex_translated, file=open('text_after_main.txt', 'w'))
 
-        # TODO: add more environments here
+        # TODO: add more here
         print('processing latex environments')
         latex_translated = self.translate_latex_env(latex_translated, ['abstract', 'acknowledgments', 'itermize', 'enumrate', 'description', 'list'], complete)
         print('processing latex commands')
         latex_translated = self.translate_latex_commands(latex_translated, ['section', 'subsection', 'subsubsection', 'caption', 'subcaption', 'footnote'], complete)
 
-        print('processing title')
-        latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)
-
         latex_translated = latex_translated.replace(f'{math_code}LB', r'\{')
         latex_translated = latex_translated.replace(f'{math_code}RB', r'\}')
+        latex_translated = latex_translated.replace(f'{math_code}PC', r'\%')
 
         latex_translated = tex_begin + '\n' + latex_translated + '\n' + tex_end
 
+        # Title is probably outside the body part
+        print('processing title')
+        latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)
+
         self.close()
         return latex_translated
diff --git a/mathtranslate/translate_tex.py b/mathtranslate/translate_tex.py
index 96b585c..b22f803 100644
--- a/mathtranslate/translate_tex.py
+++ b/mathtranslate/translate_tex.py
@@ -73,7 +73,7 @@ def main():
     import mathtranslate
     from mathtranslate import config
     from mathtranslate.config import default_engine, default_language_from, default_language_to
-    from mathtranslate.fix_encoding import fix_file_encoding
+    from mathtranslate.encoding import get_file_encoding
     from mathtranslate.translate import TextTranslator, LatexTranslator
     import argparse
     parser = argparse.ArgumentParser()
@@ -153,18 +153,17 @@ def main():
     text_translator = TextTranslator(options.engine, options.l_to, options.l_from)
     latex_translator = LatexTranslator(text_translator, options.debug)
 
-    text_original = open(input_path).read()
+    input_encoding = get_file_encoding(input_path)
+    text_original = open(input_path, encoding=input_encoding).read()
     text_final = latex_translator.translate_full_latex(text_original)
     with open(output_path, "w", encoding='utf-8') as file:
         print(text_final, file=file)
     print(output_path, 'is generated')
-    fix_file_encoding(output_path)
 
     if options.compile:
         os.system(f'xelatex {output_path}')
     else:
-        print(f"You can then manually compile it by running 'xelatex {output_path}'")
-        print("or compile it online on overleaf")
+        print(f"You can then compile it locally by running 'xelatex {output_path}' or compile it online on overleaf")
 
 
 if __name__ == '__main__':