forked from irit-melodi/irit-stac
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun-3rd-party
executable file
·88 lines (75 loc) · 3.02 KB
/
run-3rd-party
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Eric Kow
# License: BSD3
"""
Run third party tools on the unannotated data we have.
So far:
* Supplying --ark-tweet-nlp (jar file) will
run this CMU tagger on all EDUs in the documents
* Supplying --corenlp (dir) will run the Stanford
CoreNLP pipeline on all the turns
"""
import argparse
import sys
from educe import util
from educe.stac import postag, corenlp
import educe.stac
from stac.harness.corenlp import ServerConfig
import stac.harness.corenlp as corenlp_server
# ---------------------------------------------------------------------
# args
# ---------------------------------------------------------------------
# Command-line interface: an input and an output directory, one option per
# third-party pipeline, and the corpus filtering options supplied by educe.
arg_parser = argparse.ArgumentParser(
    description='Run third-party tools on the unannotated data')
arg_parser.add_argument('idir', metavar='DIR',
                        help='Input directory')
arg_parser.add_argument('odir', metavar='DIR',
                        help='Output directory')
arg_parser.add_argument('--ark-tweet-nlp', metavar='FILE',
                        help='Path to ark-tweet-nlp jar file')
arg_parser.add_argument('--corenlp', metavar='DIR',
                        help='Path to CoreNLP directory')
arg_parser.add_argument('--corenlp-server', metavar='DIR',
                        help='Launch/connect to CoreNLP server')
arg_parser.add_argument('--corenlp-address',
                        default='tcp://localhost:5900',
                        help='Address of server (use w corenlp-server)')
# store_const leaves args.live as None (not False) when the flag is absent.
arg_parser.add_argument('--live',
                        action='store_const',
                        const=True,
                        help='"Live" data (not the annotated corpus)')
educe_group = arg_parser.add_argument_group('corpus filtering arguments')
util.add_corpus_filters(educe_group, fields=['doc', 'subdoc'])
args = arg_parser.parse_args()

# These two attributes are fixed here rather than read from the command
# line; they are set before the filter predicate is built from args.
# NOTE(review): presumably mk_is_interesting reads them -- confirm in educe.
args.stage = 'unannotated'
args.annotator = None
is_interesting = util.mk_is_interesting(args)

# Refuse to run when no pipeline option was supplied.
pipelines = [args.ark_tweet_nlp, args.corenlp, args.corenlp_server]
if all(t is None for t in pipelines):
    # sys.stderr.write emits the same text as the old `print >>` statement
    # but also works on Python 3, where the chevron form raises TypeError.
    sys.stderr.write("At least one pipeline must be specified\n")
    sys.stderr.write("See the --help option\n")
    sys.exit(1)
# ---------------------------------------------------------------------
# main
# ---------------------------------------------------------------------
# Pick the reader for the input directory and decide which files to load.
# Only the regular (non-live) reader applies the corpus filter.
if not args.live:
    reader = educe.stac.Reader(args.idir)
    input_files = reader.filter(reader.files(), is_interesting)
else:
    reader = educe.stac.LiveInputReader(args.idir)
    input_files = reader.files()
corpus = reader.slurp(input_files, verbose=True)

# The tagger is independent of the CoreNLP options and runs whenever its
# jar file was given.
if args.ark_tweet_nlp:
    postag.run_tagger(corpus, args.odir, args.ark_tweet_nlp)

# CoreNLP: the server variant wins when both options are supplied.
if args.corenlp_server:
    corenlp_server.run_pipeline(
        corpus,
        args.odir,
        ServerConfig(address=args.corenlp_address,
                     directory=args.corenlp_server,
                     output=sys.stderr))
elif args.corenlp:
    corenlp.run_pipeline(corpus, args.odir, args.corenlp)