Commit a83fb86
add travis deploy data
liao961120 committed May 3, 2020
1 parent fe72b33 commit a83fb86
Showing 4 changed files with 131 additions and 26 deletions.
.gitignore: 4 changes (3 additions, 1 deletion)
@@ -1,8 +1,10 @@
 *.docx
 *.doc
-*.ipynb
 *.zip
 .ipynb_checkpoints*
 __pycache__
 test-corp
 !corp/20200408-test.docx
+test*
+*.ipynb
+2020_Budai_Rukai
.travis.yml: 31 changes (31 additions, 0 deletions)
@@ -0,0 +1,31 @@
language: python
python:
  - "3.7"
dist: xenial
sudo: false

branches:
  only:
    - master

install:
  - pip install python-docx

cache:
  pip: true

# command to run tests
script:
  - python3 GlossProcessor.py

before_deploy:
  - rm -r .gitignore *.sh *.py *.md *.txt Dockerfile corp/

deploy:
  provider: pages
  #target-branch: deploy
  skip-cleanup: true
  github-token: $GH_TOKEN  # Set in travis-ci.org dashboard, marked secure
  keep-history: true
  on:
    branch: master
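
The CI flow this file sets up: every push to master installs python-docx, runs GlossProcessor.py to rebuild the gloss search data, deletes the sources in before_deploy, and publishes whatever remains in the working tree to GitHub Pages via the pages provider and a GH_TOKEN access token. A rough local dry-run of the same sequence (a sketch only; Travis runs these as separate shell steps, and using git ls-files to spot the generated artifacts is my assumption, not part of the commit):

    import subprocess

    # `script:` step -- rebuild the gloss search data
    subprocess.run(['python3', 'GlossProcessor.py'], check=True)

    # `before_deploy` strips the sources; whatever survives (generated
    # JSON, static assets) is what the pages provider publishes
    untracked = subprocess.run(['git', 'ls-files', '--others'],
                               capture_output=True, text=True).stdout.splitlines()
    print('would deploy:', untracked)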
GlossProcessor.py: 55 changes (30 additions, 25 deletions)
@@ -1,9 +1,12 @@
 import os
 import re
 import json
 import pathlib
 import logging
 from docx import Document
 
+PERSON_NAMES = {'Takanaw', 'Elrenge', 'Kui', 'Lavakaw', 'Lavurase', 'Tingangurucu ', 'Lavausu', 'Muni', 'Balenge', 'Laucu', 'Tanebake', 'Kaku'}
+
+
 
 class GlossProcessor:

@@ -105,7 +108,7 @@ def search_gloss(self, tokens: str, regex=False):
 
 
 
-    def search_free(self, tokens: str):
+    def search_free(self, tokens: str, regex=False):
 
         # Parse into a list of tokens
         if ',' in tokens:
@@ -123,8 +126,12 @@ def search_free(self, tokens: str):
             # Check all tokens presented in gloss
             matched_num = 0
             for tk in tokens:
-                if tk in free_content:
-                    matched_num += 1
+                if regex:
+                    if re.search(tk, free_content):
+                        matched_num += 1
+                else:
+                    if tk in free_content:
+                        matched_num += 1
             if matched_num == len(tokens):
                 matched_glosses.append({
                     'file': doc_id,
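
The new regex flag switches token matching in search_free from substring containment to re.search, so search tokens can be patterns. A standalone illustration of the difference (the free_content string here is invented for the example):

    import re

    free_content = '#e I have three younger siblings.'
    tokens = ['three', 'sibling(s)?']

    # old behavior: every token must appear verbatim
    all(tk in free_content for tk in tokens)           # False
    # new behavior (regex=True): tokens are regular expressions
    all(re.search(tk, free_content) for tk in tokens)  # True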
@@ -182,32 +189,20 @@ def process_doc(fp="corp/20200325.docx"):
 
 
 def assign_gloss_free_lines(gloss):
 
-    free_lines = [ [], [], [] ]
+    free_lines = []
     gloss_lines = []
 
     for lid, l in enumerate(gloss.copy()):
         # Skip empty lines
         if l == '': continue
 
-        # Assign free lines
+        # Assign Gloss/Free lines
         if l.startswith('#'):
-            if l.startswith('#e'):
-                free_lines[0].append(l)
-            elif l.startswith('#c'):
-                free_lines[1].append(l)
-            elif l.startswith('#n'):
-                free_lines[2].append(l)
-            else:
-                # Deal with typos
-                logging.info(f'Free line(s) missing `e`, `c`, or `n` after `#`!: {l}')
-                for i, fl in enumerate(free_lines):
-                    if fl == []:
-                        free_lines[i].append(l)
-                        break
-
-        # Assign gloss lines
-        if not (l.startswith('#') or l == ''):
+            free_lines.append(l)
+        else:
             gloss_lines.append(l)
 
-    return gloss_lines, ['\n'.join(l) for l in free_lines]
+    return gloss_lines, free_lines  # ['\n'.join(l) for l in free_lines]
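
assign_gloss_free_lines is simplified here: instead of bucketing free lines into #e/#c/#n groups (with typo recovery), any '#'-prefixed line is a free translation, everything else is a gloss line, and the free lines are returned individually rather than joined per language. The same split stated on its own (the example sentence is invented):

    gloss = ['yakai ku tatulru',   # gloss line
             '#e I have three.',   # English free translation
             '#c 我有三個',        # Chinese free translation
             '']
    gloss_lines = [l for l in gloss if l and not l.startswith('#')]
    free_lines  = [l for l in gloss if l.startswith('#')]
    # gloss_lines == ['yakai ku tatulru']
    # free_lines  == ['#e I have three.', '#c 我有三個']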



@@ -264,8 +259,13 @@ def tokenize_glosses(glosses, filname):
             else:
                 zh = zh_gloss[i]
 
+            # Normalize Capital letter
+            if i == 0 and rk[0].isupper():
+                global PERSON_NAMES
+                if rk not in PERSON_NAMES:
+                    rk = rk[0].lower() + rk[1:]
 
             gloss.append( (rk, en, zh) )
 
 
         # Save data
         parsed_glosses.append(
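
This added block lowercases a sentence-initial capital in the Rukai tier unless the token is a known person name from the new PERSON_NAMES set (the global statement is redundant for a read-only lookup, but harmless). The rule in isolation, with the name set abbreviated:

    PERSON_NAMES = {'Takanaw', 'Muni', 'Balenge'}  # abbreviated from the commit

    def normalize_initial(rk, i):
        # Lowercase the first token of a sentence unless it is a person name
        if i == 0 and rk[0].isupper() and rk not in PERSON_NAMES:
            rk = rk[0].lower() + rk[1:]
        return rk

    normalize_initial('Yakai', 0)    # -> 'yakai'
    normalize_initial('Takanaw', 0)  # -> 'Takanaw' (protected)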
@@ -295,10 +295,15 @@ def get_files_timestamp(dir):
 
 
 if __name__ == "__main__":
     import json
+    GDRIVE_URL = 'https://drive.google.com/drive/folders/1vnS6szldLPlLu09c_01eqTNzt0Rs-fJ8'
+    DOCX_FOLDER_PATH = r'2020_Budai_Rukai/'
 
     logging.basicConfig(level=logging.INFO)
 
-    DOCX_FOLDER_PATH = r'/home/liao/Desktop/gloss-data/'
+    # Download from GDrive
+    cmd = f'curl gdrive.sh | bash -s {GDRIVE_URL}'
+    os.system(cmd)
+
     os.chdir(DOCX_FOLDER_PATH)
     DOCX_FOLDER_PATH = pathlib.Path('.')
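Note that os.system discards the exit status, so a failed download would be silently followed by processing an empty folder. A stricter variant (an alternative sketch, not what the commit does):

    import subprocess

    GDRIVE_URL = 'https://drive.google.com/drive/folders/1vnS6szldLPlLu09c_01eqTNzt0Rs-fJ8'
    result = subprocess.run(f'curl gdrive.sh | bash -s {GDRIVE_URL}', shell=True)
    result.check_returncode()  # raise instead of continuing on failure
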
gdrive.sh: 67 changes (67 additions, 0 deletions)
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
#<style>html{display:none}</style><script>location='https://github.com/GitHub30/gdrive.sh'</script>

id=$1
if [ ! "$id" ]
then
  cat << EOS
Usage:
  curl gdrive.sh | bash -s 0B4y35FiV1wh7QWpuVlFROXlBTHc
  curl gdrive.sh | sh -s https://drive.google.com/open?id=0B7EVK8r0v71pZjFTYXZWM3FlRnM
  curl gdrive.sh | bash -s https://drive.google.com/open?id=0B4y35FiV1wh7QWpuVlFROXlBTHc
  curl gdrive.sh | bash -s https://drive.google.com/file/d/0B4y35FiV1wh7QWpuVlFROXlBTHc/view?usp=sharing
  curl gdrive.sh | bash -s https://drive.google.com/file/d/0B4y35FiV1wh7QWpuVlFROXlBTHc/view
  curl gdrive.sh | bash -s https://docs.google.com/file/d/0BwmPMFurnk9Pak5zWEVyOUZESms/edit
  curl gdrive.sh | bash -s https://drive.google.com/drive/folders/0B7EVK8r0v71peklHb0pGdDl6R28
  curl gdrive.sh | bash -s https://drive.google.com/drive/folders/0B7EVK8r0v71peklHb0pGdDl6R28?usp=sharing
  alias gdrive.sh='curl gdrive.sh | bash -s'
  gdrive.sh 0B4y35FiV1wh7QWpuVlFROXlBTHc
EOS
  exit 1
fi

case "$id" in
  'https://drive.google.com/open?id='*) id=$(echo "$id" | awk -F'=|&' '{printf"%s",$2}');;
  'https://drive.google.com/file/d/'*|'https://docs.google.com/file/d/'*|'https://drive.google.com/drive/folders/'*) id=$(echo "$id" | awk -F'/|\?' '{printf"%s",$6}');;
esac

# Folder
if echo "$1" | grep '^https://drive.google.com/drive/folders/'; then
  json=$(curl -s https://takeout-pa.clients6.google.com/v1/exports?key=AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE -H 'origin: https://drive.google.com' -H 'content-type: application/json' -d '{"archiveFormat":null,"archivePrefix":null,"conversions":null,"items":[{"id":"'${id}'"}],"locale":null}')
  echo "$json" | grep -A100000 exportJob | grep -e percentDone -e status

  export_job_id=$(echo "$json" | grep -A100000 exportJob | awk -F'"' '$0~/^ "id"/{print$4}')
  storage_paths=''
  until [ "$storage_paths" ]; do
    json=$(curl -s "https://takeout-pa.clients6.google.com/v1/exports/$export_job_id?key=AIzaSyC1qbk75NzWBvSaDh6KnsjjA9pIrP4lYIE" -H 'origin: https://drive.google.com')
    echo "$json" | grep -B2 -A100000 exportJob | grep -e percentDone -e status
    storage_paths=$(echo "$json" | grep -A100000 exportJob | awk -F'"' '$0~/^ "storagePath"/{print$4}')
    sleep 1
  done

  for storage_path in ${storage_paths}; do
    curl -OJ "$storage_path"
  done

  filenames=$(echo "$json" | grep -A100000 exportJob | awk -F'"' '$0~/^ "fileName"/{print$4}')
  for filename in ${filenames}; do
    unzip -o "$filename"
  done
  rm ${filenames}
  exit
fi


url="https://drive.google.com/uc?export=download&id=$id"
curl -OJLc /tmp/cookie "$url"

filename=$(basename "$url")
test -f "$filename" && rm "$filename"

confirm="$(awk '/_warning_/ {print $NF}' /tmp/cookie)"
if [ "$confirm" ]
then
  curl -OJLb /tmp/cookie "$url&confirm=$confirm"
fi
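
gdrive.sh (vendored from GitHub30/gdrive.sh) accepts either a bare Drive id or a full URL. Folder URLs go through the takeout-pa export endpoint: the script creates a zip export job, polls until storagePath entries appear, then downloads and unzips the archives. Single files use the uc?export=download endpoint, retrying with the confirm token that Google's cookie carries for large files. The id extraction done by the two awk calls, restated in Python for clarity (a sketch, not part of the script):

    import re

    def drive_id(url: str) -> str:
        # https://drive.google.com/open?id=<id>
        m = re.search(r'[?&]id=([^&]+)', url)
        if m:
            return m.group(1)
        # .../file/d/<id>/view or .../drive/folders/<id>
        m = re.search(r'/(?:file/d|folders)/([^/?]+)', url)
        return m.group(1) if m else url

    drive_id('https://drive.google.com/file/d/0B4y35FiV1wh7QWpuVlFROXlBTHc/view')
    # -> '0B4y35FiV1wh7QWpuVlFROXlBTHc'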
