Commit a83fb86
add travis deploy data
liao961120 committed May 3, 2020
1 parent fe72b33 commit a83fb86
Showing 4 changed files with 131 additions and 26 deletions.
.gitignore: 4 changes (3 additions, 1 deletion)
@@ -1,8 +1,10 @@
 *.docx
 *.doc
-*.ipynb
 *.zip
 .ipynb_checkpoints*
 __pycache__
 test-corp
 !corp/20200408-test.docx
+test*
+*.ipynb
+2020_Budai_Rukai
.travis.yml: 31 changes (31 additions, 0 deletions)
@@ -0,0 +1,31 @@
language: python
python:
  - "3.7"
dist: xenial
sudo: false

branches:
  only:
    - master

install:
  - pip install python-docx

cache:
  pip: true

# command to run tests
script:
  - python3 GlossProcessor.py

before_deploy:
  - rm -r .gitignore *.sh *.py *.md *.txt Dockerfile corp/

deploy:
  provider: pages
  #target-branch: deploy
  skip-cleanup: true
  github-token: $GH_TOKEN  # Set in travis-ci.org dashboard, marked secure
  keep-history: true
  on:
    branch: master
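
The CI flow this file sets up: every push to master installs python-docx, runs GlossProcessor.py to rebuild the gloss search data, deletes the sources in before_deploy, and publishes whatever remains in the working tree to GitHub Pages via the pages provider and a GH_TOKEN access token. A rough local dry-run of the same sequence (a sketch only; Travis runs these as separate shell steps, and using git ls-files to spot the generated artifacts is my assumption, not part of the commit):

    import subprocess

    # `script:` step -- rebuild the gloss search data
    subprocess.run(['python3', 'GlossProcessor.py'], check=True)

    # `before_deploy` strips the sources; whatever survives (generated
    # JSON, static assets) is what the pages provider publishes
    untracked = subprocess.run(['git', 'ls-files', '--others'],
                               capture_output=True, text=True).stdout.splitlines()
    print('would deploy:', untracked)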
GlossProcessor.py: 55 changes (30 additions, 25 deletions)
@@ -1,9 +1,12 @@
 import os
 import re
 import json
 import pathlib
 import logging
 from docx import Document
 
+PERSON_NAMES = {'Takanaw', 'Elrenge', 'Kui', 'Lavakaw', 'Lavurase', 'Tingangurucu ', 'Lavausu', 'Muni', 'Balenge', 'Laucu', 'Tanebake', 'Kaku'}
+
+
 
 class GlossProcessor:

@@ -105,7 +108,7 @@ def search_gloss(self, tokens: str, regex=False):
 
 
 
-    def search_free(self, tokens: str):
+    def search_free(self, tokens: str, regex=False):
 
         # Parse into a list of tokens
         if ',' in tokens:
@@ -123,8 +126,12 @@ def search_free(self, tokens: str):
             # Check all tokens presented in gloss
             matched_num = 0
             for tk in tokens:
-                if tk in free_content:
-                    matched_num += 1
+                if regex:
+                    if re.search(tk, free_content):
+                        matched_num += 1
+                else:
+                    if tk in free_content:
+                        matched_num += 1
             if matched_num == len(tokens):
                 matched_glosses.append({
                     'file': doc_id,
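
The new regex flag switches token matching in search_free from substring containment to re.search, so search tokens can be patterns. A standalone illustration of the difference (the free_content string here is invented for the example):

    import re

    free_content = '#e I have three younger siblings.'
    tokens = ['three', 'sibling(s)?']

    # old behavior: every token must appear verbatim
    all(tk in free_content for tk in tokens)           # False
    # new behavior (regex=True): tokens are regular expressions
    all(re.search(tk, free_content) for tk in tokens)  # True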
@@ -182,32 +189,20 @@ def process_doc(fp="corp/20200325.docx"):
 
 
 def assign_gloss_free_lines(gloss):
 
-    free_lines = [ [], [], [] ]
+    free_lines = []
     gloss_lines = []
 
     for lid, l in enumerate(gloss.copy()):
         # Skip empty lines
         if l == '': continue
 
-        # Assign free lines
+        # Assign Gloss/Free lines
         if l.startswith('#'):
-            if l.startswith('#e'):
-                free_lines[0].append(l)
-            elif l.startswith('#c'):
-                free_lines[1].append(l)
-            elif l.startswith('#n'):
-                free_lines[2].append(l)
-            else:
-                # Deal with typos
-                logging.info(f'Free line(s) missing `e`, `c`, or `n` after `#`!: {l}')
-                for i, fl in enumerate(free_lines):
-                    if fl == []:
-                        free_lines[i].append(l)
-                        break
-
-        # Assign gloss lines
-        if not (l.startswith('#') or l == ''):
+            free_lines.append(l)
+        else:
             gloss_lines.append(l)
 
-    return gloss_lines, ['\n'.join(l) for l in free_lines]
+    return gloss_lines, free_lines  # ['\n'.join(l) for l in free_lines]
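
assign_gloss_free_lines is simplified here: instead of bucketing free lines into #e/#c/#n groups (with typo recovery), any '#'-prefixed line is a free translation, everything else is a gloss line, and the free lines are returned individually rather than joined per language. The same split stated on its own (the example sentence is invented):

    gloss = ['yakai ku tatulru',   # gloss line
             '#e I have three.',   # English free translation
             '#c 我有三個',        # Chinese free translation
             '']
    gloss_lines = [l for l in gloss if l and not l.startswith('#')]
    free_lines  = [l for l in gloss if l.startswith('#')]
    # gloss_lines == ['yakai ku tatulru']
    # free_lines  == ['#e I have three.', '#c 我有三個']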



@@ -264,8 +259,13 @@ def tokenize_glosses(glosses, filname):
             else:
                 zh = zh_gloss[i]
 
+            # Normalize Capital letter
+            if i == 0 and rk[0].isupper():
+                global PERSON_NAMES
+                if rk not in PERSON_NAMES:
+                    rk = rk[0].lower() + rk[1:]
 
             gloss.append( (rk, en, zh) )
 
 
         # Save data
         parsed_glosses.append(
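
This added block lowercases a sentence-initial capital in the Rukai tier unless the token is a known person name from the new PERSON_NAMES set (the global statement is redundant for a read-only lookup, but harmless). The rule in isolation, with the name set abbreviated:

    PERSON_NAMES = {'Takanaw', 'Muni', 'Balenge'}  # abbreviated from the commit

    def normalize_initial(rk, i):
        # Lowercase the first token of a sentence unless it is a person name
        if i == 0 and rk[0].isupper() and rk not in PERSON_NAMES:
            rk = rk[0].lower() + rk[1:]
        return rk

    normalize_initial('Yakai', 0)    # -> 'yakai'
    normalize_initial('Takanaw', 0)  # -> 'Takanaw' (protected)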
@@ -295,10 +295,15 @@ def get_files_timestamp(dir):
 
 
 if __name__ == "__main__":
     import json
+    GDRIVE_URL = 'https://drive.google.com/drive/folders/1vnS6szldLPlLu09c_01eqTNzt0Rs-fJ8'
+    DOCX_FOLDER_PATH = r'2020_Budai_Rukai/'
 
     logging.basicConfig(level=logging.INFO)
 
-    DOCX_FOLDER_PATH = r'/home/liao/Desktop/gloss-data/'
+    # Download from GDrive
+    cmd = f'curl gdrive.sh | bash -s {GDRIVE_URL}'
+    os.system(cmd)
+
     os.chdir(DOCX_FOLDER_PATH)
     DOCX_FOLDER_PATH = pathlib.Path('.')
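Note that os.system discards the exit status, so a failed download would be silently followed by processing an empty folder. A stricter variant (an alternative sketch, not what the commit does):

    import subprocess

    GDRIVE_URL = 'https://drive.google.com/drive/folders/1vnS6szldLPlLu09c_01eqTNzt0Rs-fJ8'
    result = subprocess.run(f'curl gdrive.sh | bash -s {GDRIVE_URL}', shell=True)
    result.check_returncode()  # raise instead of continuing on failure
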
gdrive.sh: 67 changes (67 additions, 0 deletions)
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
#<style>html{display:none}</style><script>location='https://github.com/GitHub30/gdrive.sh'</script>

id=$1
if [ ! "$id" ]
then
  cat << EOS
Usage:
  curl gdrive.sh | bash -s 0B4y35FiV1wh7QWpuVlFROXlBTHc
  curl gdrive.sh | sh -s https://drive.google.com/open?id=0B7EVK8r0v71pZjFTYXZWM3FlRnM
  curl gdrive.sh | bash -s https://drive.google.com/open?id=0B4y35FiV1wh7QWpuVlFROXlBTHc
  curl gdrive.sh | bash -s https://drive.google.com/file/d/0B4y35FiV1wh7QWpuVlFROXlBTHc/view?usp=sharing
  curl gdrive.sh | bash -s https://drive.google.com/file/d/0B4y35FiV1wh7QWpuVlFROXlBTHc/view
  curl gdrive.sh | bash -s https://docs.google.com/file/d/0BwmPMFurnk9Pak5zWEVyOUZESms/edit
  curl gdrive.sh | bash -s https://drive.google.com/drive/folders/0B7EVK8r0v71peklHb0pGdDl6R28
  curl gdrive.sh | bash -s https://drive.google.com/drive/folders/0B7EVK8r0v71peklHb0pGdDl6R28?usp=sharing
  alias gdrive.sh='curl gdrive.sh | bash -s'
  gdrive.sh 0B4y35FiV1wh7QWpuVlFROXlBTHc
EOS
  exit 1
fi

case "$id" in
  'https://drive.google.com/open?id='*) id=$(echo "$id" | awk -F'=|&' '{printf"%s",$2}');;
  'https://drive.google.com/file/d/'*|'https://docs.google.com/file/d/'*|'https://drive.google.com/drive/folders/'*) id=$(echo "$id" | awk -F'/|\?' '{printf"%s",$6}');;
esac

# Folder
if echo "$1" | grep '^https://drive.google.com/drive/folders/'; then
  json=$(curl -s https://takeout-pa.clients6.google.com/v1/exports?key=AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE -H 'origin: https://drive.google.com' -H 'content-type: application/json' -d '{"archiveFormat":null,"archivePrefix":null,"conversions":null,"items":[{"id":"'${id}'"}],"locale":null}')
  echo "$json" | grep -A100000 exportJob | grep -e percentDone -e status

  export_job_id=$(echo "$json" | grep -A100000 exportJob | awk -F'"' '$0~/^ "id"/{print$4}')
  storage_paths=''
  until [ "$storage_paths" ]; do
    json=$(curl -s "https://takeout-pa.clients6.google.com/v1/exports/$export_job_id?key=AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE" -H 'origin: https://drive.google.com')
    echo "$json" | grep -B2 -A100000 exportJob | grep -e percentDone -e status
    storage_paths=$(echo "$json" | grep -A100000 exportJob | awk -F'"' '$0~/^ "storagePath"/{print$4}')
    sleep 1
  done

  for storage_path in ${storage_paths}; do
    curl -OJ "$storage_path"
  done

  filenames=$(echo "$json" | grep -A100000 exportJob | awk -F'"' '$0~/^ "fileName"/{print$4}')
  for filename in ${filenames}; do
    unzip -o "$filename"
  done
  rm ${filenames}
  exit
fi


url="https://drive.google.com/uc?export=download&id=$id"
curl -OJLc /tmp/cookie "$url"

filename=$(basename "$url")
test -f "$filename" && rm "$filename"

confirm="$(awk '/_warning_/ {print $NF}' /tmp/cookie)"
if [ "$confirm" ]
then
  curl -OJLb /tmp/cookie "$url&confirm=$confirm"
fi
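
gdrive.sh (vendored from GitHub30/gdrive.sh) accepts either a bare Drive id or a full URL. Folder URLs go through the takeout-pa export endpoint: the script creates a zip export job, polls until storagePath entries appear, then downloads and unzips the archives. Single files use the uc?export=download endpoint, retrying with the confirm token that Google's cookie carries for large files. The id extraction done by the two awk calls, restated in Python for clarity (a sketch, not part of the script):

    import re

    def drive_id(url: str) -> str:
        # https://drive.google.com/open?id=<id>
        m = re.search(r'[?&]id=([^&]+)', url)
        if m:
            return m.group(1)
        # .../file/d/<id>/view or .../drive/folders/<id>
        m = re.search(r'/(?:file/d|folders)/([^/?]+)', url)
        return m.group(1) if m else url

    drive_id('https://drive.google.com/file/d/0B4y35FiV1wh7QWpuVlFROXlBTHc/view')
    # -> '0B4y35FiV1wh7QWpuVlFROXlBTHc'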
