# Reconstructed from git patch 1332d22b293c0a3907c393867b3febfb5dd768f8
# (Shrivu Shankar, 2019-11-16, subject "CoreNLP"), which creates
# aletheia/api.py.
"""Thin wrapper for running the Stanford CoreNLP command-line pipeline."""

import json
import os
import shlex
import subprocess
import tempfile

# Comma-separated annotator list passed to CoreNLP's ``-annotators`` option.
CORE_NLP_ANNOTS = 'tokenize,ssplit,pos,lemma,ner,depparse,coref,quote'
# Java classpath wildcard pointing at the CoreNLP distribution that sits next
# to this package's parent directory.
CORE_NLP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'stanford-corenlp-full-2018-10-05', '*'))
# Extra JVM options; split shell-style, so several options may be given.
CORE_NLP_JAVA_ARGS = '-Xmx8g'


def run_corenlp(texts):
    """Annotate each text with CoreNLP and return the parsed JSON results.

    Every text is written to its own UTF-8 file inside a private scratch
    directory, CoreNLP is invoked once over all of them via ``-filelist``,
    and the JSON file produced for each input is loaded. The scratch
    directory (inputs, file list and outputs) is removed when the function
    returns or raises.

    Args:
        texts: Iterable of ``str`` documents to annotate.

    Returns:
        A list with one decoded JSON object per input text, in input order.
        An empty input yields ``[]`` without starting Java.

    Raises:
        FileNotFoundError: If the ``java`` executable cannot be found, or an
            expected output file is missing.
        subprocess.CalledProcessError: If CoreNLP exits with a non-zero
            status.
    """
    # Materialise once so generators are accepted and the emptiness check
    # does not consume the input.
    texts = list(texts)
    if not texts:
        return []

    # One scratch directory holds everything, so cleanup is a single step
    # that also runs when CoreNLP or JSON decoding fails.
    with tempfile.TemporaryDirectory() as work_dir:
        input_paths = []
        for index, text in enumerate(texts):
            input_path = os.path.join(work_dir, 'doc{}.txt'.format(index))
            # Binary mode: write the exact UTF-8 bytes, no newline translation.
            with open(input_path, 'wb') as input_file:
                input_file.write(text.encode('utf-8'))
            input_paths.append(input_path)

        filelist_path = os.path.join(work_dir, 'filelist.txt')
        with open(filelist_path, 'wb') as filelist_file:
            for input_path in input_paths:
                filelist_file.write((input_path + '\n').encode('utf-8'))

        # Argument list instead of a shell string: no quoting problems with
        # paths, and the classpath wildcard reaches Java unexpanded.
        cmd = ['java'] + shlex.split(CORE_NLP_JAVA_ARGS) + [
            '-cp', CORE_NLP_PATH,
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', CORE_NLP_ANNOTS,
            '-outputFormat', 'json',
            # Keep outputs out of the caller's working directory.
            '-outputDirectory', work_dir,
            '-filelist', filelist_path,
        ]
        subprocess.run(cmd, check=True)

        output = []
        for input_path in input_paths:
            # CoreNLP is expected to name each result "<input file name>.json".
            output_path = os.path.join(work_dir, os.path.basename(input_path) + '.json')
            with open(output_path, 'r', encoding='utf-8') as output_file:
                output.append(json.load(output_file))
        return output