-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearn.py
363 lines (302 loc) · 15.9 KB
/
learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
import streamlit as st
from firebase_admin import firestore
########################################
# Importing libraries #
########################################
from streamlit_player import st_player
from youtube_transcript_api import YouTubeTranscriptApi
import streamlit as st
import requests
from reverso_context_api import Client
from collections import namedtuple
import re
import streamlit as st
###################################
# Functions #
###################################
def _extract_video_id(youtube_url):
    """Return the video ID found in a YouTube URL, or '' when there is none."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(youtube_url.strip())
    # Standard watch links carry the ID in the ``v`` query parameter; parsing
    # the query keeps trailing parameters (``&t=``, ``&list=``) out of the ID.
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]
    # Short links put the ID in the first path segment instead.
    if parsed.netloc.lower().endswith("youtu.be"):
        return parsed.path.strip("/").split("/")[0]
    return ""


def get_transcription(youtube_url: str):
    """
    Fetch the transcript of a YouTube video in the session's target language.

    The video ID is parsed out of the URL and the transcript is requested from
    YouTubeTranscriptApi for the language stored under "target_language" in
    st.session_state.

    :param youtube_url: A string representing the URL of the YouTube video.
    :raises ValueError: If no video ID can be found in `youtube_url`.
    :return: The transcript as returned by `YouTubeTranscriptApi.get_transcript`.
        The rest of this file iterates over it and reads each entry's 'text' key.
    """
    # Resolve the ID first so a malformed URL fails with a clear message
    # instead of sending a meaningless ID to the transcript API.
    youtube_id = _extract_video_id(youtube_url)
    if not youtube_id:
        raise ValueError(f"Could not find a YouTube video ID in URL: {youtube_url!r}")
    target_language = st.session_state.get("target_language")
    transcript = YouTubeTranscriptApi.get_transcript(youtube_id, languages=[target_language])
    return transcript
def remove_punctuation(input_string):
    """
    Strip every non-word character from a string and lowercase the result.

    The pattern removes each run of characters outside ``\\w``, so whitespace
    and apostrophes are dropped along with punctuation, while letters, digits
    and underscores are kept.

    Note: a second function named `remove_punctuation` is defined further down
    in this file and rebinds the name when the module is loaded.

    :param input_string: The text to clean.
    :return: The lowercased text containing only word characters.
    """
    word_chars_only = re.sub(r'\W+', '', input_string)
    return word_chars_only.lower()
# Function to get translations from Reverso Context (think about replacing/modifying
# this, as it doesn't provide for pronouns and other words).
@st.cache_data
def _get_translation_cached(text, native_language, target_language):
    """Look up `text` with Reverso Context; cached per (text, languages) triple."""
    client = Client(target_language, native_language)
    return list(client.get_translations(text))


def get_translation(text, native_language=None, target_language=None):
    """
    Retrieve translations for a given text using the Reverso Context API.

    Languages that are not passed explicitly are read from st.session_state at
    call time. (Reading them in the signature would evaluate them only once,
    when the module is loaded, and freeze whatever the session held then.)

    :param text: The word or phrase to translate.
    :param native_language: Language code passed as the second argument to
        `Client`. Defaults to the session's "native_language", or "en" if unset.
    :param target_language: Language code passed as the first argument to
        `Client`. Defaults to the session's "target_language".
    :return: A list of translations for the input text.
    """
    # NOTE(review): presumably Client's first argument is the language being
    # translated from - confirm against reverso_context_api.
    if native_language is None:
        native_language = st.session_state.get("native_language", "en")
    if target_language is None:
        target_language = st.session_state.get("target_language")
    # The cached helper receives the resolved languages, so results for one
    # language pair are never served for another.
    return _get_translation_cached(text, native_language, target_language)


# Keep `get_translation.clear()` working for callers that reset the cache.
get_translation.clear = _get_translation_cached.clear
@st.cache_data
def _import_lesson_cached(youtube_url, target_language):
    """Fetch the transcript for `youtube_url`; cached per (url, language) pair."""
    # `target_language` is not used directly. `get_transcription` reads it from
    # st.session_state, so it is accepted here only to be part of the cache key.
    return get_transcription(youtube_url)


def import_lesson(youtube_url: str):
    """
    Return the transcript of a YouTube video, caching the result.

    The cache is keyed on both the URL and the session's current
    "target_language". Keying on the URL alone would keep returning the
    transcript fetched for the previous language after the user switches.

    :param youtube_url: A string representing the URL of the YouTube video.
    :return: The transcript of the video as returned by `get_transcription`.
    """
    return _import_lesson_cached(youtube_url, st.session_state.get("target_language"))


# Keep `import_lesson.clear()` working for callers that reset the cache.
import_lesson.clear = _import_lesson_cached.clear
def try_site(url, timeout=10):
    """
    Check whether a URL is reachable by sending it a GET request.

    Redirects are not followed, so only a direct 200 response counts as
    accessible. Malformed URLs, connection failures and timeouts are reported
    as "not accessible" rather than raised, which lets callers use the result
    as a plain yes/no check.

    :param url: A string representing the URL to check, typically for a YouTube video.
    :param timeout: Seconds to wait for the server before giving up.
    :return: Boolean, True if the URL returns a 200 status code, False otherwise.
    """
    try:
        response = requests.get(url, allow_redirects=False, timeout=timeout)
    except requests.RequestException:
        # Base class of the exceptions requests raises for invalid URLs,
        # network failures and timeouts.
        return False
    return response.status_code == 200
def send_unique_words_to_firestore(unique_words, user_id, db, lang_pair):
    """
    Add words that are not yet stored to the user's vocabulary in Firestore.

    Each new word becomes a document under
    users/<user_id>/vocabulary/<lang_pair>/words with fluency '1-new'. Words
    that already have a document are left as they are. All writes are queued
    on one batch and committed together. Any exception is reported in the
    Streamlit page instead of being raised.

    :param unique_words: An iterable of words; non-string and blank entries are skipped.
    :param user_id: The ID of the user whose vocabulary is being updated.
    :param db: A Firestore client instance.
    :param lang_pair: String representing the language pair, e.g., "en-fr".
    :return: None. Updates Firestore in-place.
    """
    user_doc = db.collection('users').document(user_id)
    pair_doc = user_doc.collection('vocabulary').document(lang_pair)
    words_collection = pair_doc.collection('words')
    batch = db.batch()

    try:
        for word in unique_words:
            # Skip anything that is not a non-blank string.
            if not isinstance(word, str) or not word.strip():
                continue
            doc_ref = words_collection.document(word)
            # An existing document keeps its current fluency untouched.
            if doc_ref.get().exists:
                continue
            batch.set(doc_ref, {"fluency": "1-new"})
        batch.commit()
    except Exception as e:
        st.text(f"An error occurred while updating Firestore: {str(e)}")
# Fluency levels accepted by update_word_fluency, lowest to highest.
_VALID_FLUENCY_LEVELS = ("1-new", "2-recognized", "3-familiar", "4-learned", "5-known")


def update_word_fluency(user_id, word, new_fluency, db):
    """
    Update the fluency level of a specific word in the user's vocabulary stored in Firestore.

    The new level is validated first; the update then sets the field
    ``words.<word>.fluency`` on the document
    users/<user_id>/vocabulary/unique_words.

    :param user_id: The ID of the user whose vocabulary will be updated.
    :param word: The word whose fluency level needs to be updated.
    :param new_fluency: One of '1-new', '2-recognized', '3-familiar',
        '4-learned' or '5-known'.
    :param db: A Firestore client instance used for database operations.
    :raises ValueError: If `new_fluency` is not one of the allowed values.
    :return: None. This function updates Firestore in-place.
    """
    # NOTE(review): send_unique_words_to_firestore in this file stores words as
    # documents under vocabulary/<lang_pair>/words, while this function writes
    # to vocabulary/unique_words - confirm which layout is current.
    if new_fluency not in _VALID_FLUENCY_LEVELS:
        raise ValueError("Fluency must be '1-new', '2-recognized', '3-familiar', '4-learned', or '5-known'")

    user_vocabulary_ref = (
        db.collection('users')
        .document(user_id)
        .collection('vocabulary')
        .document('unique_words')
    )
    user_vocabulary_ref.update({
        f"words.{word}.fluency": new_fluency
    })
@st.cache_data
def _batch_get_translations_cached(words, native_language, target_language):
    """Translate every word with one Reverso client; cached per (words, languages)."""
    client = Client(target_language, native_language)
    return {word: list(client.get_translations(word)) for word in words}


def batch_get_translations(words, native_language=None, target_language=None):
    """
    Batch translate a set of words using the Reverso Context API.

    Languages that are not passed explicitly are taken from the language
    selection held in st.session_state, read at call time. (Reading them in
    the signature would evaluate them only once, when the module is loaded.)

    :param words: An iterable of words to translate.
    :param native_language: Language code passed as the second argument to
        `Client`. Defaults to the session's "native_language".
    :param target_language: Language code passed as the first argument to
        `Client`. Defaults to the session's "target_language".
    :return: A dictionary where keys are the words from the input and values are lists of translations.
    """
    if native_language is None:
        native_language = st.session_state.get("native_language")
    if target_language is None:
        target_language = st.session_state.get("target_language")
    # The cached helper receives the resolved languages, so results for one
    # language pair are never served for another.
    return _batch_get_translations_cached(words, native_language, target_language)


# Keep `batch_get_translations.clear()` working for callers that reset the cache.
batch_get_translations.clear = _batch_get_translations_cached.clear
def remove_punctuation(input_string):
    """
    Lowercase a string after deleting its punctuation, keeping apostrophes.

    Every run of characters that is neither a word character, whitespace nor
    an apostrophe is removed, so contractions such as "c'est" stay intact.

    :param input_string: The text to clean.
    :return: The lowercased text with punctuation other than apostrophes removed.
    """
    kept = re.sub(r'[^\w\s\']+', '', input_string)
    return kept.lower()
def process_transcript(transcript, db):
    """
    Turn a transcript into HTML in which every word carries a translation tooltip.

    Steps:
    - Collect the distinct cleaned words of all lines except '[Music]' cues.
    - If a username is set in the session, store those words via
      `send_unique_words_to_firestore`.
    - Translate all words in one `batch_get_translations` call.
    - Wrap each displayed word in the tooltip markup, showing up to three
      translations.

    :param transcript: An iterable of entries whose 'text' key holds one line.
    :param db: A Firestore client instance, forwarded to `send_unique_words_to_firestore`.
    :return: One HTML string with a line per transcript entry, or None if an
        error occurred (the error is shown in the Streamlit page).
    """
    # Local import keeps this block self-contained; `html` is standard library.
    import html

    try:
        spoken_lines = [line["text"] for line in transcript if line["text"] != '[Music]']

        unique_words = set()
        for text in spoken_lines:
            unique_words.update(remove_punctuation(word) for word in text.split())
        # Tokens made only of punctuation clean down to the empty string.
        unique_words.discard('')

        native_language = st.session_state.get("native_language")
        target_language = st.session_state.get("target_language")

        if st.session_state.get('username'):
            lang_pair = f"{native_language}-{target_language}"
            send_unique_words_to_firestore(unique_words, st.session_state.username, db, lang_pair)

        # Sorted so the same vocabulary always produces the same argument,
        # which lets the cached translation call recognise a repeat request.
        translations = batch_get_translations(sorted(unique_words), native_language, target_language)

        html_output = []
        for text in spoken_lines:
            words_with_tooltips = []
            for word in text.split():
                # Transcript text and translations come from external services
                # and end up in markup rendered as raw HTML, so escape both.
                shown_word = html.escape(word)
                # The cleaned word is the lookup key; the original is displayed.
                cleaned_word = remove_punctuation(word)
                if not cleaned_word:
                    # Stand-alone punctuation has nothing to translate, but it
                    # is still part of the sentence, so show it without a tooltip.
                    words_with_tooltips.append(shown_word)
                    continue
                translation = translations.get(cleaned_word, [])[:3]
                tooltip_text = html.escape(", ".join(translation))
                words_with_tooltips.append(
                    f'<div class="tooltip">{shown_word}<span class="tooltiptext">{tooltip_text}</span></div>'
                )
            html_output.append(' '.join(words_with_tooltips))

        return '\n'.join(html_output)
    except Exception as e:
        st.error(f'Error processing script, in process_transcript(): {str(e)}')
        return None
def st_player(youtube_url):
    """
    Show a YouTube video in the page through Streamlit's built-in video element.

    Note: this definition rebinds the name `st_player`, which is also imported
    from `streamlit_player` at the top of the file, so calls made after the
    module has loaded reach this wrapper.

    :param youtube_url: A string containing the URL of the YouTube video to be displayed.
    :return: None. The video is rendered directly in the Streamlit app.
    """
    st.video(youtube_url)
def _show_plain_transcript(transcript):
    """Display the transcript as plain text, skipping '[Music]' cue lines."""
    script = [line["text"] for line in transcript if line["text"] != '[Music]']
    st.text('\n\n'.join(script))  # Double newline for extra space between lines


def app():
    """
    Main application function for 'LanguageBuddy', a Streamlit app for language learning.

    The function injects the tooltip/transcript CSS, creates the Firestore
    client, and then:
    - Stops with an error if no username is set in the session.
    - Shows the instructions and a text area for a YouTube URL.
    - When 'Import Lesson' is clicked:
        - Validates the URL with `try_site`.
        - Plays the video.
        - Fetches the transcript with `import_lesson`.
        - Displays the transcript with translation tooltips, falling back to
          plain text when tooltip generation fails.

    :return: None. This function controls the flow and display of the Streamlit app.
    """
    st.markdown("""
<style>
.tooltip {
position: relative;
display: inline-block;
border-bottom: 1px dotted black; /* Optional: Indicates it's hoverable */
}
.tooltip .tooltiptext {
visibility: hidden;
width: 120px;
background-color: black;
color: #fff;
text-align: center;
border-radius: 6px;
padding: 5px 0;
position: absolute;
z-index: 1;
bottom: 125%; /* Tooltip above the word */
left: 50%;
margin-left: -60px; /* Half of width to center the tooltip */
opacity: 0;
transition: opacity 0.3s;
}
.tooltip:hover .tooltiptext {
visibility: visible;
opacity: 1;
}
/* New styles for larger text and increased line spacing */
.transcript {
font-size: 24px; /* Double the typical font size */
line-height: 2; /* Double line height for better readability */
}
</style>
""", unsafe_allow_html=True)

    db = firestore.client()
    st.session_state.db = db

    # `.get` treats a session without a 'username' key as "not logged in"
    # instead of failing with an AttributeError.
    if st.session_state.get('username', '') == '':
        st.error("Please log in with a valid username.")
        return

    # NOTE(review): the source this was restored from had lost its indentation;
    # only the first instructions paragraph is placed inside the expander -
    # confirm whether the "best results" list belongs inside it too.
    with st.expander(':orange[Expand for Instructions]'):
        st.markdown("""
In a separate tab of your web browser, open [YouTube](https://www.youtube.com) and find a video to watch.
You can use [Google Translate](https://translate.google.com/) to get the search terms that interest you
in your target language from your native language. The video should be less than 5 minutes long and have
captions in your target language. Use Youtube's Filters -> Duration -> Under 4 minutes.
Copy the URL for the YouTube video and paste it in the box below, then click the **Import Lesson** button.
Videos longer than 5 minutes risk not properly loading the transcript and other LanguageBuddy features.
The shorter a video is, the easier it will be to complete the steps below.
""")

    st.subheader(":orange[For the best results]")
    st.markdown("""
<ol>
<li>Pick a video less than 5 minutes long. (Novice learners should aim for 1 to 2 minutes)</li>
<li>Watch the entire video without subtitles (or just listen to the audio)</li>
<li>Read the transcription and use the mouse to hover over unknown words, their translation will appear in a tooltip</li>
<li>Rewatch the video with subtitles, or listen to the audio while you read the transcription</li>
<li>Repeat steps 2, 3, and 4 until you understand most of the content (roughly 50 to 80 %)</li>
<li>Commit to doing this exercise at least once a day on LanguageBuddy!</li>
</ol>
""", unsafe_allow_html=True)

    youtube_url = st.text_area(
        label=' :orange[Enter the YouTube URL below and click Import Lesson - (video should be less than 5 minutes)]',
        placeholder='Enter YouTube URL',
        height=None,
        max_chars=500
    )

    if not st.button('Import Lesson', use_container_width=True):
        return

    # A text area can hold stray spaces or a trailing newline around the
    # pasted link; remove them before validating or parsing the URL.
    youtube_url = youtube_url.strip()
    if youtube_url == '':
        st.warning('Please enter a YouTube URL.')
        return

    if not try_site(youtube_url):
        st.error('Invalid YouTube URL or video not accessible.')
        return

    try:
        st_player(youtube_url)
        transcript = import_lesson(youtube_url)
        try:
            formatted_transcript = process_transcript(transcript, db)
            if formatted_transcript:
                # Add a class for styling to the transcript output
                st.markdown(f"""
<div class="transcript">
{formatted_transcript}
</div>
""", unsafe_allow_html=True)
            else:
                # No tooltip markup was produced; still show the text.
                _show_plain_transcript(transcript)
        except Exception as e:
            st.error(f'Error processing script, in app(): {str(e)}')
            _show_plain_transcript(transcript)
    except Exception as e:
        st.error(f'Error processing video: {str(e)}')