"""Split a scraped tweet dataset into parts and translate the non-English rows to English."""
import logging
import multiprocessing as mp
import os
import time

import pandas as pd
from deep_translator import GoogleTranslator
from tqdm.auto import tqdm

LOG_FOLDER = "logs"
os.makedirs(LOG_FOLDER, exist_ok=True)

logging.basicConfig(
    filename=os.path.join(LOG_FOLDER, 'translation_log.log'),
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
PARTS_FOLDER = "data_parts"
TRANSLATED_FOLDER = "translated_parts"
os.makedirs(PARTS_FOLDER, exist_ok=True)
os.makedirs(TRANSLATED_FOLDER, exist_ok=True)
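
# Pipeline layout: split_data() writes data_parts/part_*.pkl, and
# process_part() mirrors each part into translated_parts/ under the same name.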


def translate_row(row):
    """Translate a single row with a delay to avoid rate limits."""
    text, lang = row['text'], row['lang']
    try:
        time.sleep(0.25)
        return GoogleTranslator(source=lang, target='en').translate(text)
    except Exception as e:
        logging.error(f"Translation failed for lang={lang}: {e}")
        return 'not_translated'
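
# Note on throttling: time.sleep(0.25) runs inside each pool worker, so with
# N workers the aggregate rate is roughly 4N requests per second. If the
# service still rate-limits, a longer sleep or a retry with backoff (not
# implemented here) is the usual mitigation.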


def split_data(input_file, num_parts=100):
    """Split the data into `num_parts` and save as separate files."""
    data = pd.read_pickle(input_file)
    non_eng_tweets = data[data['lang'] != 'en']
    non_eng_tweets = non_eng_tweets.reset_index(drop=True)
    part_size = len(non_eng_tweets) // num_parts + 1
    for i in tqdm(range(0, len(non_eng_tweets), part_size), desc="Splitting Data"):
        part = non_eng_tweets.iloc[i:i + part_size].reset_index(drop=True)
        part_file = os.path.join(PARTS_FOLDER, f'part_{i // part_size}.pkl')
        part.to_pickle(part_file)
        logging.info(f"Saved part: {part_file}")


def process_part(part_file):
    """Process a single part and save translated results."""
    part = pd.read_pickle(part_file)
    if 'translated' not in part.columns:
        part['translated'] = ''
    with mp.Pool(mp.cpu_count()) as pool:
        translated_texts = pool.map(translate_row, part.to_dict('records'))
    part['translated'] = translated_texts
    translated_file = os.path.join(TRANSLATED_FOLDER, os.path.basename(part_file))
    part.to_pickle(translated_file)
    logging.info(f"Translated part saved: {translated_file}")


def process_all_parts():
    """Translate every part file in PARTS_FOLDER, one part at a time."""
    part_files = sorted(os.path.join(PARTS_FOLDER, f)
                        for f in os.listdir(PARTS_FOLDER) if f.endswith('.pkl'))
    for part_file in tqdm(part_files, desc="Processing Parts"):
        process_part(part_file)
    logging.info("All parts translated and saved.")


if __name__ == "__main__":
    INPUT_FILE = '/Users/deryadurmush/Desktop/Jobs /balenciaga/balenciaga_scarped.pkl'
    split_data(INPUT_FILE, num_parts=100)
    process_all_parts()
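
# To reassemble the full translated dataset afterwards (a sketch; the merged
# filename is hypothetical and not part of the original script):
#
#     import glob
#     files = sorted(glob.glob(os.path.join(TRANSLATED_FOLDER, 'part_*.pkl')),
#                    key=lambda f: int(f.split('_')[-1].split('.')[0]))
#     pd.concat([pd.read_pickle(f) for f in files],
#               ignore_index=True).to_pickle('translated_all.pkl')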