Skip to content

Commit

Permalink
Merge pull request #19 from SUSYUSTC/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
SUSYUSTC authored Mar 28, 2023
2 parents 1315c53 + 8b5c27c commit 3ed53e3
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 78 deletions.
4 changes: 2 additions & 2 deletions mathtranslate/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
__version__ = "2.1.2"
__version__ = "2.1.3"
__author__ = "Jiace Sun"

import os
ROOT = os.path.dirname(os.path.abspath(__file__))
from . import config
from . import translate
from . import tencent
from . import fix_encoding
from . import encoding
from . import process_latex
from . import process_text
from . import translate_tex
9 changes: 9 additions & 0 deletions mathtranslate/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import chardet


def get_file_encoding(filename):
    """Guess the character encoding of a file from its raw bytes.

    The whole file is read in binary mode and passed to ``chardet.detect``;
    the ``"encoding"`` entry of the detection result is returned unchanged.

    Args:
        filename: Path of the file to inspect.

    Returns:
        Whatever ``chardet.detect`` stores under the ``"encoding"`` key.
        NOTE(review): chardet presumably reports ``None`` here when it cannot
        decide (e.g. for an empty file) -- confirm that callers cope with it.
    """
    with open(filename, "rb") as stream:
        raw_bytes = stream.read()
    return chardet.detect(raw_bytes)["encoding"]
25 changes: 0 additions & 25 deletions mathtranslate/fix_encoding.py

This file was deleted.

52 changes: 23 additions & 29 deletions mathtranslate/process_latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,18 @@ def modify_after(text):
return text


def replace_latex_envs(text):
def replace_latex_objects(text):
r"""
Replaces all LaTeX environments in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
Replaces all LaTeX objects in a given text with the format "XMATH_{digit1}_{digit2}_..._{digit_last}",
applies a given function to the resulting text (excluding the "XMATH_{digit1}_{digit2}_..._{digit_last}" parts),
and returns both the processed text and a list of replaced LaTeX environments.
Supported LaTeX environments: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
and returns both the processed text and a list of replaced LaTeX objects.
Supported LaTeX objects: \[ xxx \], \begin{xxx} \end{xxx}, $$ $$,
$ $, \( xxx \), \xxx[xxx]{xxx}, \xxx{xxx}, and \xxx.
Returns the processed text and a list of replaced LaTeX environments.
Returns the processed text and a list of replaced LaTeX objects.
"""

# define regular expressions for each LaTeX environment
latex_env_regex = [
# define regular expressions for each LaTeX object
latex_obj_regex = [
r"\$\$(.*?)\$\$", # $$ $$
r"\$(.*?)\$", # $ $
r"\\\[(.*?)\\\]", # \[ xxx \]
Expand All @@ -64,30 +64,30 @@ def replace_latex_envs(text):
pattern_brace, # {xxx}
]

# iterate through each LaTeX environment and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
# iterate through each LaTeX object and replace with "XMATH_{digit1}_{digit2}_..._{digit_last}"
count = 0
replaced_envs = []
for regex_symbol in latex_env_regex:
replaced_objs = []
for regex_symbol in latex_obj_regex:
pattern = regex.compile(regex_symbol, regex.DOTALL)
while pattern.search(text):
latex_env = pattern.search(text).group()
replaced_envs.append(f'{latex_env}')
latex_obj = pattern.search(text).group()
replaced_objs.append(f'{latex_obj}')
text = pattern.sub(' ' + variable_code(count) + ' ', text, 1)
count += 1

text = modify_text(text, modify_before)
return text, replaced_envs
return text, replaced_objs


def recover_latex_envs(text, replaced_envs, final=False):
nenvs = len(replaced_envs)
def recover_latex_objects(text, replaced_objs, final=False):
nobjs = len(replaced_objs)
matched_indices = []

def get_env(digit_str):
def get_obj(digit_str):
index = int(''.join(digit_str.split('_')))
matched_indices.append(index)
if index < nenvs:
return replaced_envs[index]
if index < nobjs:
return replaced_objs[index]
else:
#assert final
return '???'
Expand All @@ -96,16 +96,16 @@ def get_env(digit_str):
pattern = re.compile(match_code_replace)
total_num = 0
while True:
text, num_modify = pattern.subn(lambda match: get_env(match.group(1)), text)
text, num_modify = pattern.subn(lambda match: get_obj(match.group(1)), text)
total_num += num_modify
if num_modify == 0:
break
n_good = len(set(matched_indices).intersection(set(range(nenvs))))
n_good = len(set(matched_indices).intersection(set(range(nobjs))))
n_bad1 = len(matched_indices) - n_good
n_bad2 = nenvs - n_good
n_bad2 = nobjs - n_good
n_bad = max(n_bad1, n_bad2)
if final and n_bad > 0:
print(n_bad, 'latex environments are probably wrong in total', nenvs)
print(n_bad, 'latex objects are probably wrong in total', nobjs)
return text


Expand All @@ -116,7 +116,7 @@ def remove_tex_comments(text):
If "%" is at the beginning of a line then delete this line.
Returns the processed string.
"""
text = re.sub(r"\n(?<!\\)%.*?(?=\n)", "", text)
text = re.sub(r"\n\s*(?<!\\)%.*?(?=\n)", "", text)
text = re.sub(r"(?<!\\)%.*?(?=\n)", "", text)

return text
Expand All @@ -137,12 +137,6 @@ def split_latex_document(text, begin_code, end_code):
return body, pre, post


def count_braces(text):
    r"""Count the opening and closing braces of ``text``, ignoring escaped ones.

    Every ``\{`` and then every ``\}`` sequence is removed first, so only the
    braces left over after that removal are counted.

    Args:
        text: The string to inspect.

    Returns:
        A tuple ``(n_open, n_close)`` with the number of remaining ``{`` and
        ``}`` characters.
    """
    # Strip the escaped forms in the same order as before: "\{" first, "\}" second.
    unescaped = text.replace(r'\{', '').replace(r'\}', '')
    n_open = unescaped.count('{')
    n_close = unescaped.count('}')
    return n_open, n_close


def process_specific_env(latex, function, env_names):
pattern = regex.compile(pattern_env, regex.DOTALL)

Expand Down
37 changes: 20 additions & 17 deletions mathtranslate/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,13 @@ def __init__(self, translator: TextTranslator, debug=False):
if self.debug:
self.f_old = open("text_old", "w", encoding='utf-8')
self.f_new = open("text_new", "w", encoding='utf-8')
self.f_env = open("envs", "w", encoding='utf-8')
self.f_obj = open("objs", "w", encoding='utf-8')

def close(self):
if self.debug:
self.f_old.close()
self.f_new.close()
self.f_env.close()
self.f_obj.close()

def translate_paragraph_text(self, text):
'''
Expand All @@ -86,9 +86,9 @@ def translate_paragraph_text(self, text):

def translate_paragraph_latex(self, latex_original_paragraph, num, complete):
'''
Translate a latex paragraph, which means that it could contain latex environments
Translate a latex paragraph, which means that it could contain latex objects
'''
text_original_paragraph, envs = process_latex.replace_latex_envs(latex_original_paragraph)
text_original_paragraph, objs = process_latex.replace_latex_objects(latex_original_paragraph)
# Since \n is equivalent to space in latex, we change \n back to space
# otherwise the translators view them as separate sentences
text_original_paragraph = text_original_paragraph.replace('\n', '')
Expand All @@ -99,24 +99,24 @@ def translate_paragraph_latex(self, latex_original_paragraph, num, complete):
if self.debug:
print(f'\n\nParagraph {num}\n\n', file=self.f_old)
print(f'\n\nParagraph {num}\n\n', file=self.f_new)
print(f'\n\nParagraph {num}\n\n', file=self.f_env)
print(f'\n\nParagraph {num}\n\n', file=self.f_obj)
print(text_original_paragraph, file=self.f_old)
print(text_translated_paragraph, file=self.f_new)
for i, env in enumerate(envs):
print(f'env {i}', file=self.f_env)
print(env, file=self.f_env)
latex_translated_paragraph = process_latex.recover_latex_envs(text_translated_paragraph, envs, final=True)
for i, obj in enumerate(objs):
print(f'obj {i}', file=self.f_obj)
print(obj, file=self.f_obj)
latex_translated_paragraph = process_latex.recover_latex_objects(text_translated_paragraph, objs, final=True)
return latex_translated_paragraph

def split_latex_to_paragraphs(self, latex):
'''
1. convert latex to text and environments
1. convert latex to text and objects
2. split text
3. convert text back to environments
3. convert text back to objects
'''
text, envs = process_latex.replace_latex_envs(latex)
text, objs = process_latex.replace_latex_objects(latex)
paragraphs_text = text.split('\n\n')
paragraphs_latex = [process_latex.recover_latex_envs(paragraph_text, envs) for paragraph_text in paragraphs_text]
paragraphs_latex = [process_latex.recover_latex_objects(paragraph_text, objs) for paragraph_text in paragraphs_text]
return paragraphs_latex

def _translate_latex_objects(self, match_function, latex_original, names, complete):
Expand Down Expand Up @@ -166,6 +166,7 @@ def translate_full_latex(self, latex_original):
# It is difficult for regex to exclude \{ during match so I replace it to something else and then replace back
latex_original = latex_original.replace(r'\{', f'{math_code}LB')
latex_original = latex_original.replace(r'\}', f'{math_code}RB')
latex_original = latex_original.replace(r'\%', f'{math_code}PC')

latex_original_paragraphs = self.split_latex_to_paragraphs(latex_original)
latex_translated_paragraphs = []
Expand All @@ -182,19 +183,21 @@ def translate_full_latex(self, latex_original):
if self.debug:
print(latex_translated, file=open('text_after_main.txt', 'w'))

# TODO: add more environments here
# TODO: add more here
print('processing latex environments')
latex_translated = self.translate_latex_env(latex_translated, ['abstract', 'acknowledgments', 'itermize', 'enumrate', 'description', 'list'], complete)
print('processing latex commands')
latex_translated = self.translate_latex_commands(latex_translated, ['section', 'subsection', 'subsubsection', 'caption', 'subcaption', 'footnote'], complete)

print('processing title')
latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)

latex_translated = latex_translated.replace(f'{math_code}LB', r'\{')
latex_translated = latex_translated.replace(f'{math_code}RB', r'\}')
latex_translated = latex_translated.replace(f'{math_code}PC', r'\%')

latex_translated = tex_begin + '\n' + latex_translated + '\n' + tex_end

# Title is probably outside the body part
print('processing title')
latex_translated = self.translate_latex_commands(latex_translated, ['title'], complete)

self.close()
return latex_translated
9 changes: 4 additions & 5 deletions mathtranslate/translate_tex.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def main():
import mathtranslate
from mathtranslate import config
from mathtranslate.config import default_engine, default_language_from, default_language_to
from mathtranslate.fix_encoding import fix_file_encoding
from mathtranslate.encoding import get_file_encoding
from mathtranslate.translate import TextTranslator, LatexTranslator
import argparse
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -153,18 +153,17 @@ def main():
text_translator = TextTranslator(options.engine, options.l_to, options.l_from)
latex_translator = LatexTranslator(text_translator, options.debug)

text_original = open(input_path).read()
input_encoding = get_file_encoding(input_path)
text_original = open(input_path, encoding=input_encoding).read()
text_final = latex_translator.translate_full_latex(text_original)
with open(output_path, "w", encoding='utf-8') as file:
print(text_final, file=file)
print(output_path, 'is generated')
fix_file_encoding(output_path)

if options.compile:
os.system(f'xelatex {output_path}')
else:
print(f"You can then manually compile it by running 'xelatex {output_path}'")
print("or compile it online on overleaf")
print(f"You can then compile it locally by running 'xelatex {output_path}' or compile it online on overleaf")


if __name__ == '__main__':
Expand Down

0 comments on commit 3ed53e3

Please sign in to comment.