Merge pull request #19 from SUSYUSTC/dev

Dev
SUSYUSTC · Mar 28, 2023 · 3ed53e3
2 parents 1315c53 + 8b5c27c
commit 3ed53e3
Show file tree

Hide file tree

Showing 6 changed files with 58 additions and 78 deletions.
diff --git a/mathtranslate/__init__.py b/mathtranslate/__init__.py
@@ -1,12 +1,12 @@
-__version__ = "2.1.2"
+__version__ = "2.1.3"
 __author__ = "Jiace Sun"
 
 import os
 ROOT = os.path.dirname(os.path.abspath(__file__))
 from . import config
 from . import translate
 from . import tencent
-from . import fix_encoding
+from . import encoding
 from . import process_latex
 from . import process_text
 from . import translate_tex
diff --git a/mathtranslate/encoding.py b/mathtranslate/encoding.py
@@ -0,0 +1,9 @@
+import chardet
+
+
+def get_file_encoding(filename):
+    with open(filename, "rb") as f:
+        data = f.read()
+        result = chardet.detect(data)
+        current_encoding = result["encoding"]
+    return current_encoding
diff --git a/mathtranslate/fix_encoding.py b/mathtranslate/fix_encoding.py
diff --git a/mathtranslate/process_latex.py b/mathtranslate/process_latex.py
@@ -42,18 +42,18 @@ def modify_after(text):
     return text
 
 
-def replace_latex_envs(text):
+def replace_latex_objects(text):
     r"""
-    Replaces all LaTeX environments in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
+    Replaces all LaTeX objects in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
     applies a given function to the resulting text (excluding the "XMATH_{digit1}_{digit2}_..._{digit_last}" parts),
-    and returns both the processed text and a list of replaced LaTeX environments.
-    Supported LaTeX environments: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
+    and returns both the processed text and a list of replaced LaTeX objects.
+    Supported LaTeX objects: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
     $ $, \( xxx \), \xxx[xxx]{xxx}, \xxx{xxx}, and \xxx.
-    Returns the processed text and a list of replaced LaTeX environments.
+    Returns the processed text and a list of replaced LaTeX objects.
     """
 
-    # define regular expressions for each LaTeX environment
-    latex_env_regex = [
+    # define regular expressions for each LaTeX object
+    latex_obj_regex = [
         r"\$\$(.*?)\$\$",  # $$ $$
         r"\$(.*?)\$",  # $ $
         r"\\\[(.*?)\\\]",  # \[ xxx \]
@@ -64,30 +64,30 @@ def replace_latex_envs(text):
         pattern_brace,  # {xxx}
     ]
 
-    # iterate through each LaTeX environment and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
+    # iterate through each LaTeX object and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
     count = 0
-    replaced_envs = []
-    for regex_symbol in latex_env_regex:
+    replaced_objs = []
+    for regex_symbol in latex_obj_regex:
         pattern = regex.compile(regex_symbol, regex.DOTALL)
         while pattern.search(text):
-            latex_env = pattern.search(text).group()
-            replaced_envs.append(f'{latex_env}')
+            latex_obj = pattern.search(text).group()
+            replaced_objs.append(f'{latex_obj}')
             text = pattern.sub(' ' + variable_code(count) + ' ', text, 1)
             count += 1
 
     text = modify_text(text, modify_before)
-    return text, replaced_envs
+    return text, replaced_objs
 
 
-def recover_latex_envs(text, replaced_envs, final=False):
-    nenvs = len(replaced_envs)
+def recover_latex_objects(text, replaced_objs, final=False):
+    nobjs = len(replaced_objs)
     matched_indices = []
 
-    def get_env(digit_str):
+    def get_obj(digit_str):
         index = int(''.join(digit_str.split('_')))
         matched_indices.append(index)
-        if index < nenvs:
-            return replaced_envs[index]
+        if index < nobjs:
+            return replaced_objs[index]
         else:
             #assert final
             return '???'
@@ -96,16 +96,16 @@ def get_env(digit_str):
     pattern = re.compile(match_code_replace)
     total_num = 0
     while True:
-        text, num_modify = pattern.subn(lambda match: get_env(match.group(1)), text)
+        text, num_modify = pattern.subn(lambda match: get_obj(match.group(1)), text)
         total_num += num_modify
         if num_modify == 0:
             break
-    n_good = len(set(matched_indices).intersection(set(range(nenvs))))
+    n_good = len(set(matched_indices).intersection(set(range(nobjs))))
     n_bad1 = len(matched_indices) - n_good
-    n_bad2 = nenvs - n_good
+    n_bad2 = nobjs - n_good
     n_bad = max(n_bad1, n_bad2)
     if final and n_bad > 0:
-        print(n_bad, 'latex environments are probably wrong in total', nenvs)
+        print(n_bad, 'latex objects are probably wrong in total', nobjs)
     return text
 
 
@@ -116,7 +116,7 @@ def remove_tex_comments(text):
     If "%" is at the beginning of a line then delete this line.
     Returns the processed string.
     """
-    text = re.sub(r"\n(?<!\\)%.*?(?=\n)", "", text)
+    text = re.sub(r"\n\s*(?<!\\)%.*?(?=\n)", "", text)
     text = re.sub(r"(?<!\\)%.*?(?=\n)", "", text)
 
     return text
@@ -137,12 +137,6 @@ def split_latex_document(text, begin_code, end_code):
     return body, pre, post
 
 
-def count_braces(text):
-    text = text.replace(r'\{', '')
-    text = text.replace(r'\}', '')
-    return text.count('{'), text.count('}')
-
-
 def process_specific_env(latex, function, env_names):
     pattern = regex.compile(pattern_env, regex.DOTALL)
 

diff --git a/mathtranslate/translate.py b/mathtranslate/translate.py
@@ -53,13 +53,13 @@ def __init__(self, translator: TextTranslator, debug=False):
         if self.debug:
             self.f_old = open("text_old", "w", encoding='utf-8')
             self.f_new = open("text_new", "w", encoding='utf-8')
-            self.f_env = open("envs", "w", encoding='utf-8')
+            self.f_obj = open("objs", "w", encoding='utf-8')
 
     def close(self):
         if self.debug:
             self.f_old.close()
             self.f_new.close()
-            self.f_env.close()
+            self.f_obj.close()
 
     def translate_paragraph_text(self, text):
         '''
@@ -86,9 +86,9 @@ def translate_paragraph_text(self, text):
 
     def translate_paragraph_latex(self, latex_original_paragraph, num, complete):
         '''
-        Translate a latex paragraph, which means that it could contain latex environments
+        Translate a latex paragraph, which means that it could contain latex objects
         '''
-        text_original_paragraph, envs = process_latex.replace_latex_envs(latex_original_paragraph)
+        text_original_paragraph, objs = process_latex.replace_latex_objects(latex_original_paragraph)
         # Since \n is equivalent to space in latex, we change \n back to space
         # otherwise the translators view them as separate sentences
         text_original_paragraph = text_original_paragraph.replace('\n', '')
@@ -99,24 +99,24 @@ def translate_paragraph_latex(self, latex_original_paragraph, num, complete):
         if self.debug:
             print(f'\n\nParagraph {num}\n\n', file=self.f_old)
             print(f'\n\nParagraph {num}\n\n', file=self.f_new)
-            print(f'\n\nParagraph {num}\n\n', file=self.f_env)
+            print(f'\n\nParagraph {num}\n\n', file=self.f_obj)
             print(text_original_paragraph, file=self.f_old)
             print(text_translated_paragraph, file=self.f_new)
-            for i, env in enumerate(envs):
-                print(f'env {i}', file=self.f_env)
-                print(env, file=self.f_env)
-        latex_translated_paragraph = process_latex.recover_latex_envs(text_translated_paragraph, envs, final=True)
+            for i, obj in enumerate(objs):
+                print(f'obj {i}', file=self.f_obj)
+                print(obj, file=self.f_obj)
+        latex_translated_paragraph = process_latex.recover_latex_objects(text_translated_paragraph, objs, final=True)
         return latex_translated_paragraph
 
     def split_latex_to_paragraphs(self, latex):
         '''
-        1. convert latex to text and environments
+        1. convert latex to text and objects
         2. split text
-        3. convert text back to environments
+        3. convert text back to objects
         '''
-        text, envs = process_latex.replace_latex_envs(latex)
+        text, objs = process_latex.replace_latex_objects(latex)
         paragraphs_text = text.split('\n\n')
-        paragraphs_latex = [process_latex.recover_latex_envs(paragraph_text, envs) for paragraph_text in paragraphs_text]
+        paragraphs_latex = [process_latex.recover_latex_objects(paragraph_text, objs) for paragraph_text in paragraphs_text]
         return paragraphs_latex
 
     def _translate_latex_objects(self, match_function, latex_original, names, complete):
@@ -166,6 +166,7 @@ def translate_full_latex(self, latex_original):
         # It is difficult for regex to exclude \{ during match so I replace it to something else and then replace back
         latex_original = latex_original.replace(r'\{', f'{math_code}LB')
         latex_original = latex_original.replace(r'\}', f'{math_code}RB')
+        latex_original = latex_original.replace(r'\%', f'{math_code}PC')
 
         latex_original_paragraphs = self.split_latex_to_paragraphs(latex_original)
         latex_translated_paragraphs = []
@@ -182,19 +183,21 @@ def translate_full_latex(self, latex_original):
         if self.debug:
             print(latex_translated, file=open('text_after_main.txt', 'w'))
 
-        # TODO: add more environments here
+        # TODO: add more here
         print('processing latex environments')
         latex_translated = self.translate_latex_env(latex_translated, ['abstract', 'acknowledgments', 'itermize', 'enumrate', 'description', 'list'], complete)
         print('processing latex commands')
         latex_translated = self.translate_latex_commands(latex_translated, ['section', 'subsection', 'subsubsection', 'caption', 'subcaption', 'footnote'], complete)
 
-        print('processing title')
-        latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)
-
         latex_translated = latex_translated.replace(f'{math_code}LB', r'\{')
         latex_translated = latex_translated.replace(f'{math_code}RB', r'\}')
+        latex_translated = latex_translated.replace(f'{math_code}PC', r'\%')
 
         latex_translated = tex_begin + '\n' + latex_translated + '\n' + tex_end
 
+        # Title is probably outside the body part
+        print('processing title')
+        latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)
+
         self.close()
         return latex_translated
diff --git a/mathtranslate/translate_tex.py b/mathtranslate/translate_tex.py
@@ -73,7 +73,7 @@ def main():
     import mathtranslate
     from mathtranslate import config
     from mathtranslate.config import default_engine, default_language_from, default_language_to
-    from mathtranslate.fix_encoding import fix_file_encoding
+    from mathtranslate.encoding import get_file_encoding
     from mathtranslate.translate import TextTranslator, LatexTranslator
     import argparse
     parser = argparse.ArgumentParser()
@@ -153,18 +153,17 @@ def main():
     text_translator = TextTranslator(options.engine, options.l_to, options.l_from)
     latex_translator = LatexTranslator(text_translator, options.debug)
 
-    text_original = open(input_path).read()
+    input_encoding = get_file_encoding(input_path)
+    text_original = open(input_path, encoding=input_encoding).read()
     text_final = latex_translator.translate_full_latex(text_original)
     with open(output_path, "w", encoding='utf-8') as file:
         print(text_final, file=file)
     print(output_path, 'is generated')
-    fix_file_encoding(output_path)
 
     if options.compile:
         os.system(f'xelatex {output_path}')
     else:
-        print(f"You can then manually compile it by running 'xelatex {output_path}'")
-        print("or compile it online on overleaf")
+        print(f"You can then compile it locally by running 'xelatex {output_path}' or compile it online on overleaf")
 
 
 if __name__ == '__main__':