add audio parse function

liao961120 · Jun 21, 2020 · 820f726 · 820f726
1 parent 2b7e58d
commit 820f726
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 4 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -9,7 +9,7 @@ branches:
     - master
 
 install:
-  - pip install python-docx
+  - pip install https://github.com/liao961120/python-docx/archive/master.zip
   - pip install chardet
   - pip install beautifulsoup4
 

diff --git a/GlossProcessor.py b/GlossProcessor.py
@@ -5,6 +5,7 @@
 import pathlib
 import logging
 from datetime import datetime
+from urllib.parse import urlparse
 from docx import Document
 from bs4 import UnicodeDammit
 
@@ -186,12 +187,13 @@ def process_doc(fp="corp/20200325.docx"):
             if len(a_doc) == i + 1 or (not a_doc[i + 1].strip().startswith('#')):
                 end_idx = i + 1
                 break
-    glosses_on.append( (gloss_num_old, i) )
+    glosses_on.append( (gloss_num_old, end_idx) )
 
     # Parse metadata
     meta = {
         'speaker': '',
         'modified': '',
+        'audio': '',
     }
     # Get speaker
     for line in a_doc:
@@ -206,7 +208,27 @@ def process_doc(fp="corp/20200325.docx"):
     for start, end in glosses_on:
         gloss_num = int(re.match("(\d+)\.", a_doc[start])[1])
         gloss_lines = [ l.strip() for l in a_doc[(start + 1):end] ]
-        glosses.append( (gloss_num, gloss_lines, meta) )
+
+        # Check audio in one gloss
+        for i, line in enumerate(gloss_lines.copy()):
+            if line.startswith('#a'):
+                # Parse line to audio url
+                audio_data = line.replace('#a', '').strip().split()
+
+                url = urlparse(audio_data[0])
+                start_time = parse_audio_time(audio_data[1])
+                if url.netloc != 'drive.google.com' or start_time is None:
+                    logging.warning(f"{str(fp)}#{gloss_num}: invalid audio format")
+
+                file_id = url.path.replace('view', '').replace('file', '').replace('d', '').strip('/')
+                new_url = f"https://drive.google.com/uc?export=open&id={file_id}#t={start_time}"
+
+                # Add audio url to meta
+                meta['audio'] = new_url
+                break
+
+        glosses.append( (gloss_num, gloss_lines, meta.copy()) )
+        meta['audio'] = ''
 
     return glosses
 
@@ -219,7 +241,7 @@ def assign_gloss_free_lines(gloss):
 
     for lid, l in enumerate(gloss.copy()):
         # Skip empty lines
-        if l == '': continue
+        if l == '' or l.startswith('#a'): continue
 
         # Assign Gloss/Free lines
         if l.startswith('#'):
@@ -314,6 +336,13 @@ def tokenize_glosses(glosses, filname):
     return parsed_glosses
 
 
+def parse_audio_time(time_str: str):
+    """Normalize a ``M:S`` or ``H:M:S`` timestamp to zero-padded fields.
+
+    Args:
+        time_str: Timestamp text such as ``"1:23"`` or ``"0:1:23"``;
+            surrounding whitespace is ignored.
+
+    Returns:
+        The timestamp with every field left-padded to two digits
+        (e.g. ``"01:23"``), or ``None`` when ``time_str`` is not two or
+        three colon-separated fields of one or two digits each.
+    """
+    time_str = time_str.strip()
+    # Raw string: ``\d`` inside a plain literal is an invalid escape sequence.
+    if not re.fullmatch(r'\d{1,2}:\d{1,2}(:\d{1,2})?', time_str):
+        return None
+    return ':'.join(field.zfill(2) for field in time_str.split(':'))
+
 
 def get_files_timestamp(dir):
     data = {}

diff --git a/extractdocx.py b/extractdocx.py
@@ -0,0 +1,35 @@
+try:
+    from xml.etree.cElementTree import XML
+except ImportError:
+    from xml.etree.ElementTree import XML
+import zipfile
+
+
+"""
+Module that extracts text from MS XML Word documents (.docx).
+(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
+"""
+
+# Namespace URI wrapped in braces, the "{uri}tag" form ElementTree uses for
+# qualified element names.
+WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
+# Qualified names of the `p` and `t` elements; get_docx_text iterates over
+# them as paragraphs and text runs respectively.
+PARA = WORD_NAMESPACE + 'p'
+TEXT = WORD_NAMESPACE + 't'
+
+
+def get_docx_text(path):
+    """Extract the plain text of a .docx file.
+
+    Args:
+        path: Location of the .docx archive; passed unchanged to
+            ``zipfile.ZipFile``.
+
+    Returns:
+        The document text as a single string: the text nodes of each
+        paragraph joined by single spaces, with non-empty paragraphs
+        separated by a blank line.
+    """
+    # The context manager closes the archive even if reading the member fails.
+    with zipfile.ZipFile(path) as document:
+        xml_content = document.read('word/document.xml')
+    tree = XML(xml_content)
+
+    paragraphs = []
+    # Element.getiterator() was removed in Python 3.9; iter() is its
+    # supported equivalent.
+    for paragraph in tree.iter(PARA):
+        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
+        if texts:
+            paragraphs.append(' '.join(texts))
+
+    return '\n\n'.join(paragraphs)