run-3rd-party

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Eric Kow
# License: BSD3

"""
Run third party tools on the unannotated data we have.
So far:

  * Supplying --ark-tweet-nlp (jar file) will
    run this CMU tagger on all EDUs in the documents

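  * Supplying --corenlp (dir) will run the Stanford
    CoreNLP pipeline on all the turns

  * Supplying --corenlp-server (dir) will instead launch/connect
    to a CoreNLP server at --corenlp-address; when given, it is
    used in preference to --corenlp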
"""

import argparse
import sys

from educe import util
from educe.stac import postag, corenlp
import educe.stac

from stac.harness.corenlp import ServerConfig
import stac.harness.corenlp as corenlp_server

# ---------------------------------------------------------------------
# args
# ---------------------------------------------------------------------

# Command-line interface: two positional directories, one option per
# third-party pipeline, and the doc/subdoc corpus filters from educe.util.
arg_parser = argparse.ArgumentParser(
    description='Run third party tools on the unannotated data')
arg_parser.add_argument('idir', metavar='DIR',
                        help='Input directory')
arg_parser.add_argument('odir', metavar='DIR',
                        help='Output directory')
arg_parser.add_argument('--ark-tweet-nlp', metavar='FILE',
                        help='Path to ark-tweet-nlp jar file')
arg_parser.add_argument('--corenlp', metavar='DIR',
                        help='Path to CoreNLP directory')
arg_parser.add_argument('--corenlp-server', metavar='DIR',
                        help='Launch/connect to CoreNLP server')
arg_parser.add_argument('--corenlp-address',
                        default='tcp://localhost:5900',
                        help='Address of server (use w corenlp-server)')
arg_parser.add_argument('--live',
                        action='store_true',
                        help='"Live" data (not the annotated corpus)')
educe_group = arg_parser.add_argument_group('corpus filtering arguments')
util.add_corpus_filters(educe_group, fields=['doc', 'subdoc'])

args = arg_parser.parse_args()
# Only the doc/subdoc filters are exposed on the command line; stage and
# annotator are pinned here before the filter is built (presumably
# mk_is_interesting reads them from args -- confirm in educe.util).
args.stage = 'unannotated'
args.annotator = None
is_interesting = util.mk_is_interesting(args)

pipelines = [args.ark_tweet_nlp, args.corenlp, args.corenlp_server]

# The dispatch code further down tests these options for truthiness, so an
# empty value counts as "not given" here too; otherwise the script would
# read the whole corpus and then silently run nothing.
if not any(pipelines):
    # sys.stderr.write behaves the same on Python 2 and 3, unlike the
    # Python-2-only "print >> stream" statement.
    sys.stderr.write("At least one pipeline must be specified\n")
    sys.stderr.write("See the --help option\n")
    sys.exit(1)

# ---------------------------------------------------------------------
# main
# ---------------------------------------------------------------------

# Choose the reader first, then decide which of its files to load:
# live input takes every file the reader reports, corpus input keeps
# only the files accepted by is_interesting.
if args.live:
    reader = educe.stac.LiveInputReader(args.idir)
else:
    reader = educe.stac.Reader(args.idir)

anno_files = reader.files()
if not args.live:
    anno_files = reader.filter(anno_files, is_interesting)

corpus = reader.slurp(anno_files, verbose=True)

# The tagger run is independent of the CoreNLP choice below.
if args.ark_tweet_nlp:
    postag.run_tagger(corpus, args.odir, args.ark_tweet_nlp)

# The server-backed run wins when both CoreNLP options are supplied.
if args.corenlp_server:
    corenlp_server.run_pipeline(
        corpus,
        args.odir,
        ServerConfig(address=args.corenlp_address,
                     directory=args.corenlp_server,
                     output=sys.stderr))
elif args.corenlp:
    corenlp.run_pipeline(corpus, args.odir, args.corenlp)