Skip to content

Commit

Permalink
Implementing command-line support for passing multiple files/paths
Browse files: browse the repository at this point in the history
  • Loading branch information
anwala committed Dec 30, 2019
1 parent 0bb79b9 commit 813da7c
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 34 deletions.
10 changes: 10 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.DS_Store
._.DS_Store
__pycache__
.git
.gitignore

Dockerfile
LICENSE
README.md
setup.py
8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM python:3.7-stretch

LABEL maintainer="Alexander Nwala <[email protected]>"

WORKDIR /home/sumgram

RUN pip install --upgrade pip && pip install sumgram
ENTRYPOINT ["sumgram"]
14 changes: 4 additions & 10 deletions sumgram/sumgram.py
Original file line number Diff line number Diff line change
Expand Up @@ -1347,7 +1347,7 @@ def get_top_sumgrams(doc_dct_lst, n=2, params=None):
def get_args():

parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=30))
parser.add_argument('path', help='Folder path containing input documents or path to single file')
parser.add_argument('path', nargs='+', help='Folder path containing input documents or path to single file or multiple files')

parser.add_argument('-d', '--print-details', help='Print detailed output', action='store_true')
parser.add_argument('-n', '--base-ngram', help='The base n (integer) for generating top sumgrams, if n = 2, bigrams would be the base ngram', type=int, default=2)
Expand Down Expand Up @@ -1507,43 +1507,37 @@ def main():
set_log_defaults(params)
set_logger_dets( params['log_dets'] )

if( params['parallel_readtext'] is True ):
doc_lst = getText(args.path, threadCount=params['thread_count'])
else:
doc_lst = getText(args.path, threadCount=0)

doc_lst = readTextFromFilesRecursive(args.path, addDetails=True)
proc_req(doc_lst, params)

if __name__ == 'sumgram.sumgram':
from sumgram.util import dumpJsonToFile
from sumgram.util import getColorTxt
from sumgram.util import getStopwordsSet
from sumgram.util import genericErrorInfo
from sumgram.util import getText
from sumgram.util import isMatchInOrder
from sumgram.util import nlpIsServerOn
from sumgram.util import nlpSentenceAnnotate
from sumgram.util import nlpServerStartStop
from sumgram.util import overlapFor2Sets
from sumgram.util import parallelTask
from sumgram.util import phraseTokenizer
from sumgram.util import readTextFromFile
from sumgram.util import readTextFromFilesRecursive
from sumgram.util import rmStopwords
from sumgram.util import sortDctByKey
else:
from util import dumpJsonToFile
from util import getColorTxt
from util import getStopwordsSet
from util import genericErrorInfo
from util import getText
from util import isMatchInOrder
from util import nlpIsServerOn
from util import nlpSentenceAnnotate
from util import nlpServerStartStop
from util import overlapFor2Sets
from util import parallelTask
from util import phraseTokenizer
from util import readTextFromFile
from util import readTextFromFilesRecursive
from util import rmStopwords
from util import sortDctByKey

Expand Down
131 changes: 107 additions & 24 deletions sumgram/util.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import gzip
import json
import logging
import os
import re
import requests
import sys
import tarfile

from subprocess import check_output, CalledProcessError
from multiprocessing import Pool
Expand Down Expand Up @@ -353,21 +355,120 @@ def dumpJsonToFile(outfilename, dictToWrite, indentFlag=True, extraParams=None):
if( extraParams['verbose'] ):
logger.info('\twriteTextToFile(), wrote: ' + outfilename)
except:
genericErrorInfo('\terror: outfilename: ' + outfilename)
genericErrorInfo('\n\terror: outfilename: ' + outfilename)

def readTextFromFile(infilename):

text = ''
def getTextFromGZ(path):
    """Return the UTF-8 decoded content of a gzip-compressed file.

    path: location of the .gz file to read.

    Returns the decoded text. On any failure (file cannot be opened, data is
    not valid gzip, bytes are not valid UTF-8) the error is reported through
    genericErrorInfo() and an empty string is returned instead.
    """
    try:
        with gzip.open(path, 'rb') as f:
            return f.read().decode('utf-8')
    except Exception:
        # Best-effort reader: report the problem and fall through to the
        # empty default. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        genericErrorInfo()

    return ''

def readTextFromTar(filename, addDetails=True):
    """Read the text of every regular file stored in a tar archive.

    filename: path to the archive; it is opened with mode 'r:*'.
    addDetails: when True each entry is the dict built by getTextDetails()
        (member basename as 'filename', the text, and 'src' set to the
        archive path); when False each entry is the plain text string.

    Members whose name ends with '.gz' are gunzipped before decoding. All
    content is decoded as UTF-8 and members with empty text are skipped.
    A member that fails to decode or read is reported and skipped; a failure
    on the archive itself is reported through genericErrorInfo() and whatever
    was collected so far (possibly nothing) is returned.
    """
    payload = []
    try:
        # The context manager closes the archive even if iterating it fails.
        with tarfile.open(filename, 'r:*') as tar:

            for tarinfo in tar.getmembers():
                if( tarinfo.isreg() is False ):
                    continue

                try:
                    with tar.extractfile(tarinfo) as f:
                        text = f.read()

                    if( tarinfo.name.endswith('.gz') ):
                        text = gzip.decompress(text)

                    text = text.decode('utf-8')
                    if( text == '' ):
                        continue

                    if( addDetails is True ):
                        extra = {'src': filename}
                        text = getTextDetails( filename=os.path.basename(tarinfo.name), text=text, extra=extra )

                    payload.append(text)

                except UnicodeDecodeError:
                    logger.error('\nreadTextFromTar(), UnicodeDecodeError file: ' + tarinfo.name)
                except Exception:
                    genericErrorInfo('\n\treadTextFromTar(), Error reading file: ' + tarinfo.name)

    except Exception:
        genericErrorInfo()

    return payload

def readTextFromFile(infilename):
    """Return the full text content of the file at infilename.

    The file is opened in text mode and read exactly once. If it cannot be
    opened or read, the error is reported through genericErrorInfo() and an
    empty string is returned.
    """
    try:
        with open(infilename, 'r') as infile:
            # Single read: a second read() on the same handle would start at
            # end-of-file and always yield ''.
            return infile.read()
    except Exception:
        genericErrorInfo( '\n\treadTextFromFile(), error filename: ' + infilename )

    return ''

def getTextDetails(filename, text, extra=None):
    """Bundle a document's text with its filename and optional extra fields.

    filename: stored under the 'filename' key.
    text: stored under the 'text' key.
    extra: optional mapping whose items are copied into the result; a key
        that matches 'filename' or 'text' replaces that entry.

    Returns a new dict; extra itself is not modified.
    """
    details = {'filename': filename, 'text': text}

    if( extra is not None ):
        details.update(extra.items())

    return details

def readTextFromFilesRecursive(files, addDetails=True):
    """Collect the text of every file reachable from the given path(s).

    files: a single path string or a list of path strings. Any other type
        yields an empty list.
    addDetails: when True each entry is the dict built by getTextDetails()
        (or by readTextFromTar() for archive members); when False each entry
        is the plain text string. The flag is applied at every directory
        depth.

    Each path is dispatched on, in this order:
      * name ends with '.tar' or '.tar.gz' -> readTextFromTar()
      * name ends with '.gz'               -> getTextFromGZ()
      * existing regular file              -> readTextFromFile()
      * existing directory                 -> its entries, recursively
    Paths matching none of these, and files whose text is empty, contribute
    nothing.

    Returns a flat list of the collected entries.
    """
    if( isinstance(files, str) ):
        files = [files]

    if( isinstance(files, list) is False ):
        return []

    result = []
    for f in files:

        f = f.strip()

        if( f.endswith(('.tar', '.tar.gz')) ):
            result += readTextFromTar(f, addDetails=addDetails)
            continue

        if( os.path.isdir(f) and f.endswith('.gz') is False ):
            # Sorted so the output order does not depend on the file system.
            secondLevelFiles = [os.path.join(f, f2) for f2 in sorted(os.listdir(f))]
            # Forward addDetails so nested files honor the caller's choice.
            result += readTextFromFilesRecursive(secondLevelFiles, addDetails=addDetails)
            continue

        if( f.endswith('.gz') ):
            text = getTextFromGZ(f)
        elif( os.path.isfile(f) ):
            text = readTextFromFile(f)
        else:
            # Neither a readable file nor a directory: nothing to collect.
            continue

        if( text != '' ):
            if( addDetails is True ):
                text = getTextDetails(filename=f, text=text)

            result.append(text)

    return result
#nlp server - start

def nlpIsServerOn(addr='http://localhost:9000'):
Expand Down Expand Up @@ -613,21 +714,3 @@ def sequentialGetTxt(folder):

def getColorTxt(txt, ansiCode='91m'):
    """Wrap txt in ANSI escape sequences.

    txt: value to wrap; it is rendered with str.format().
    ansiCode: the part of the escape sequence that follows the ESC-[
        introducer (default '91m').

    Returns ESC-[ + ansiCode + the formatted txt + the ESC-[00m sequence.
    """
    prefix = '\033[' + ansiCode
    suffix = '{}\033[00m'.format(txt)

    return prefix + suffix

def getText(path, threadCount=5):
    """Load documents from a directory or from a single file.

    path: a directory, or the path of one file.
    threadCount: for a directory, a value above 0 delegates to
        parallelGetTxt() with that count; otherwise sequentialGetTxt() is
        used. Ignored when path is not a directory.

    Returns whatever the directory reader returns, or, for a non-directory
    path, a one-element list holding {'text': <content from readTextFromFile()>}.
    """
    if( os.path.isdir(path) is False ):
        # Anything that is not a directory is handed to the single-file reader.
        return [{
            'text': readTextFromFile(path)
        }]

    if( threadCount > 0 ):
        return parallelGetTxt(path, threadCount=threadCount)

    return sequentialGetTxt(path)

0 comments on commit 813da7c

Please sign in to comment.