Skip to content

Commit

Permalink
add audio parse function
Browse files Browse the repository at this point in the history
  • Loading branch information
liao961120 committed Jun 21, 2020
1 parent 2b7e58d commit 820f726
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ branches:
- master

install:
- pip install python-docx
- pip install https://github.com/liao961120/python-docx/archive/master.zip
- pip install chardet
- pip install beautifulsoup4

Expand Down
35 changes: 32 additions & 3 deletions GlossProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pathlib
import logging
from datetime import datetime
from urllib.parse import urlparse
from docx import Document
from bs4 import UnicodeDammit

Expand Down Expand Up @@ -186,12 +187,13 @@ def process_doc(fp="corp/20200325.docx"):
if len(a_doc) == i + 1 or (not a_doc[i + 1].strip().startswith('#')):
end_idx = i + 1
break
glosses_on.append( (gloss_num_old, i) )
glosses_on.append( (gloss_num_old, end_idx) )

# Parse metadata
meta = {
'speaker': '',
'modified': '',
'audio': '',
}
# Get speaker
for line in a_doc:
Expand All @@ -206,7 +208,27 @@ def process_doc(fp="corp/20200325.docx"):
for start, end in glosses_on:
gloss_num = int(re.match("(\d+)\.", a_doc[start])[1])
gloss_lines = [ l.strip() for l in a_doc[(start + 1):end] ]
glosses.append( (gloss_num, gloss_lines, meta) )

# Check audio in one gloss
for i, line in enumerate(gloss_lines.copy()):
if line.startswith('#a'):
# Parse line to audio url
audio_data = line.replace('#a', '').strip().split()

url = urlparse(audio_data[0])
start_time = parse_audio_time(audio_data[1])
if url.netloc != 'drive.google.com' or start_time is None:
logging.warning(f"{str(fp)}#{gloss_num}: invalid audio format")

file_id = url.path.replace('view', '').replace('file', '').replace('d', '').strip('/')
new_url = f"https://drive.google.com/uc?export=open&id={file_id}#t={start_time}"

# Add audio url to meta
meta['audio'] = new_url
break

glosses.append( (gloss_num, gloss_lines, meta.copy()) )
meta['audio'] = ''

return glosses

Expand All @@ -219,7 +241,7 @@ def assign_gloss_free_lines(gloss):

for lid, l in enumerate(gloss.copy()):
# Skip empty lines
if l == '': continue
if l == '' or l.startswith('#a'): continue

# Assign Gloss/Free lines
if l.startswith('#'):
Expand Down Expand Up @@ -314,6 +336,13 @@ def tokenize_glosses(glosses, filname):
return parsed_glosses


def parse_audio_time(time_str: str):
    """Normalize an audio timestamp to zero-padded, colon-separated fields.

    Accepts ``M:S`` or ``H:M:S`` style text in which every field is one or
    two ASCII digits. Leading and trailing whitespace is ignored.

    Args:
        time_str: Raw timestamp text, e.g. ``"1:5"`` or ``" 0:01:05 "``.

    Returns:
        The timestamp with every field left-padded with zeros to two
        characters (``"01:05"``, ``"00:01:05"``), or ``None`` when the text
        does not consist of two or three such fields.
    """
    time_str = time_str.strip()
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    # re.ASCII: only 0-9 count as digits, so other Unicode digit characters
    # are rejected instead of being passed through into the result.
    pattern = r'\d{1,2}:\d{1,2}(?::\d{1,2})?'
    if re.fullmatch(pattern, time_str, flags=re.ASCII) is None:
        return None
    return ':'.join(field.zfill(2) for field in time_str.split(':'))


def get_files_timestamp(dir):
data = {}
Expand Down
35 changes: 35 additions & 0 deletions extractdocx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
import zipfile


"""
Module that extracts text from an MS Word XML document (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
"""

# XML namespace of WordprocessingML, in the "{uri}" form ElementTree uses
# as a prefix for qualified tag names.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
# Qualified tag names looked up by get_docx_text(): <w:p> and <w:t>.
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'


def get_docx_text(path):
    """Return the text of a .docx file as a single string.

    Reads ``word/document.xml`` from the archive at *path*, collects the
    non-empty ``<w:t>`` text nodes of each ``<w:p>`` element, joins the
    nodes of one paragraph with a single space, and joins paragraphs with
    a blank line. Paragraphs without any text are omitted.

    Args:
        path: Path (or file-like object) of the .docx archive, as accepted
            by ``zipfile.ZipFile``.

    Returns:
        The extracted text; an empty string if no paragraph has text.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(' '.join(texts))

    return '\n\n'.join(paragraphs)

0 comments on commit 820f726

Please sign in to comment.