Skip to content

Commit

Permalink
🕹️: CoreNLP
Browse files Browse the repository at this point in the history
  • Loading branch information
sshh12 committed Nov 16, 2019
1 parent be0331a commit 1332d22
Showing 1 changed file with 34 additions and 0 deletions.
34 changes: 34 additions & 0 deletions aletheia/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import json
import os
import subprocess
import tempfile

# Extra options placed right after ``java`` on the command line
# (-Xmx8g sets the JVM maximum heap size to 8 GB).
CORE_NLP_JAVA_ARGS = '-Xmx8g'

# Comma-separated annotator names handed to CoreNLP's ``-annotators`` option.
CORE_NLP_ANNOTS = 'tokenize,ssplit,pos,lemma,ner,depparse,coref,quote'

# Value for ``java -cp``: the CoreNLP distribution directory located one level
# above this file's directory, with a trailing ``*`` classpath wildcard.
CORE_NLP_PATH = os.path.abspath(
    os.path.join(
        os.path.dirname(__file__),
        '..',
        'stanford-corenlp-full-2018-10-05',
        '*',
    )
)


def run_corenlp(texts):
    """Annotate ``texts`` with Stanford CoreNLP and return the parsed JSON.

    Each text is written to its own temporary ``.txt`` file, the paths of
    those files are collected in a temporary file list, and a single ``java``
    process is launched with ``-filelist`` and ``-outputFormat json``. The
    results are then loaded from ``<input basename>.json`` in the current
    working directory (where CoreNLP is expected to write them by default).
    All temporary input and output files are removed before returning, even
    when an error occurs.

    Args:
        texts: Iterable of ``str`` documents to annotate.

    Returns:
        A list with one parsed JSON document per input text, in input order.
        An empty input yields an empty list without starting a JVM.

    Raises:
        subprocess.CalledProcessError: If the CoreNLP process exits with a
            non-zero status.
        FileNotFoundError: If ``java`` cannot be found or an expected output
            file was not produced.
    """
    # Materialize once so generators are not exhausted before being written.
    texts = list(texts)
    if not texts:
        return []

    cleanup_fns = []
    try:
        # NamedTemporaryFile (not TemporaryFile) is required here: only it
        # accepts ``delete=False`` and exposes a real path on every platform.
        input_fns = []
        for text in texts:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as text_file:
                cleanup_fns.append(text_file.name)
                text_file.write(text.encode('utf-8'))
            input_fns.append(text_file.name)

        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as list_file:
            cleanup_fns.append(list_file.name)
            list_file.write(''.join(fn + '\n' for fn in input_fns).encode('utf-8'))

        # An argument list avoids shell quoting problems (e.g. spaces in temp
        # paths) and keeps the classpath ``*`` from being glob-expanded.
        # The literal ``[`` / ``]`` from the previous command string looked
        # like usage notation for optional arguments and are not passed on.
        cmd = ['java']
        cmd += CORE_NLP_JAVA_ARGS.split()
        cmd += [
            '-cp', CORE_NLP_PATH,
            'edu.stanford.nlp.pipeline.StanfordCoreNLP',
            '-annotators', CORE_NLP_ANNOTS,
            '-outputFormat', 'json',
            '-filelist', list_file.name,
        ]
        subprocess.run(cmd, check=True)

        output = []
        for input_fn in input_fns:
            output_fn = os.path.basename(input_fn) + '.json'
            # Register before opening so a partially written file is removed too.
            cleanup_fns.append(output_fn)
            with open(output_fn, 'r', encoding='utf-8') as f:
                output.append(json.load(f))
        return output
    finally:
        for fn in cleanup_fns:
            try:
                os.remove(fn)
            except FileNotFoundError:
                # Output may never have been created if CoreNLP failed; do not
                # let cleanup mask the original error.
                pass

0 comments on commit 1332d22

Please sign in to comment.