add NAACL 2025 paper Refiner's code #210

Open · wants to merge 5 commits into base: master
69 changes: 69 additions & 0 deletions research/NEU/Refiner/README.md
@@ -0,0 +1,69 @@
# DocREfiner

The code includes performance-test scripts for evaluating LLM-only methods.

## `test_only_llama.py`

### Code Overview

This script uses Llama-2-7B to perform document-level relation extraction. The demo processes the dataset, predicts relations with the help of the ATLOP SLM logits, and evaluates the results.
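To make the role of the SLM logits concrete, here is a minimal sketch (a hypothetical helper, not code from this repository) of how ATLOP scores over the relation classes could be turned into a shortlist of candidate relations for one entity pair, using the `num_to_p` and `p_to_name` maps built in `docre/processData.py`:

```python
import numpy as np

def topk_candidates(logits, num_to_p, p_to_name, k=4):
    """Return the names of the k highest-scoring relations for one entity pair."""
    top_ids = np.argsort(logits)[::-1][:k]  # indices of the k largest ATLOP logits
    return [p_to_name[num_to_p[int(i)]] for i in top_ids]
```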

### Requirements

* Python 3.7+
* mindspore
* mindformers
* scikit-learn
* tqdm
* pandas
* docre (custom module)
* c2net (custom module)

### Installation

Install the required Python packages using pip:

```bash
pip install mindspore mindformers scikit-learn tqdm pandas openpyxl
```

Ensure the custom module `docre` is available on your Python path. Note that `c2net` is not required: simply replace every path that involves `c2net` with your own local paths.
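For example, a minimal stand-in for the `c2net`-managed paths might look like this (the paths are placeholders; adjust them to your environment):

```python
# Hypothetical local paths replacing the c2net-managed ones.
dataset_path = "./dataset/docred"           # DocRED data directory
pretrain_model_path = "./models/llama2_7b"  # local Llama-2-7B checkpoint
output_path = "./output"                    # where predictions and scores are written
```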

### Usage

1. **Prepare the Environment**: Initialize the data context and set paths for datasets and pretrained models.
2. **Load Data and Models**: Use the provided functions to load datasets, relation templates, and pre-trained model logits.
3. **Generate Prompts**: Construct prompts and inputs for the model based on the loaded data.
4. **Run the Model**: Use the LLaMA2-7B model to generate predictions.
5. **Evaluate Results**: Save the model's predictions and evaluate them against the ground truth using the provided evaluation function.
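Put together, the flow looks roughly like the sketch below. The loaders come from `docre/processData.py` in this PR; `build_prompt` and `generate` are placeholders for the prompt construction and the mindformers inference call:

```python
from docre.processData import return_rel2dict, return_templates, return_docred

# 1-2. Load the relation maps, templates, and dataset (ATLOP logits are loaded separately).
p_to_num, num_to_p, p_to_name, name_to_p = return_rel2dict("./dataset/docred/rel_info.json")
p2templates = return_templates("./dataNEW/rel_templates.xlsx")
titles, entities, entity_types, entity_indexs, documents_raw, relations = \
    return_docred("./dataset/docred/dev.json")

# 3-4. Build a prompt per document / entity pair and query the LLM (placeholders):
#     completion = generate(build_prompt(document, options))

# 5. Save the predictions and score them with docre.evaluation.evaluate
#    (see the Evaluation section below).
```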

#### Running the Script

Execute the script with Python:

```bash
python test_only_llama.py
```

The script will process the data, generate prompts, run the model, and evaluate the results. Output predictions will be saved to `dev_result_llama2_atlop.json`.

#### Example Output

The script prints example inputs and completions, showing the format of the processed data and the model's predictions.

```plaintext
INSTRUCTION: Read the DOCUMENT and answer the QUESTION. Write the answers in ANSWER.
DOCUMENT: ...
QUESTION: Which of the following is right?
...
ANSWER:
```
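One plausible way to assemble such a prompt string (the helper is hypothetical; in practice the options would come from the relation templates filled in for the top-k candidate relations):

```python
from typing import List

def build_prompt(document: str, options: List[str]) -> str:
    """Assemble a prompt in the INSTRUCTION/DOCUMENT/QUESTION/ANSWER format above."""
    question = "Which of the following is right?\n" + "\n".join(
        f"{chr(ord('A') + i)}. {opt}" for i, opt in enumerate(options)
    )
    return (
        "INSTRUCTION: Read the DOCUMENT and answer the QUESTION. Write the answers in ANSWER.\n"
        f"DOCUMENT: {document}\n"
        f"QUESTION: {question}\n"
        "ANSWER:"
    )
```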

#### Evaluation

After running the script, the results are evaluated using the `evaluate` function, which compares the model's predictions with the ground truth and outputs performance metrics.
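`evaluate` (defined in `docre/evaluation.py`, shown below) can also be called directly. A sketch with illustrative paths:

```python
from docre.evaluation import evaluate

# Point the paths at your DocRED copy and result file.
evaluate(data_path="./dataset/docred",
         test_data="dev.json",
         result_data="./dev_result_llama2_atlop.json",
         output_path="./",
         compare_distant=True)
```

This prints the precision, recall, and F1 variants and writes the same numbers to a scores file under `output_path`. Note that `compare_distant=True` additionally requires `train_distant.json` under `data_path`.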

### Notes

- Ensure the `dataset_path`, `pretrain_model_path`, and other paths are correctly set according to your environment.
- Modify the top-k variable to change the number of top predictions considered.
- The script is set to ignore warnings for cleaner output.
185 changes: 185 additions & 0 deletions research/NEU/Refiner/docre/evaluation.py
@@ -0,0 +1,185 @@
#!/usr/bin/env python
import os
import json

def gen_train_facts(data_file_name, truth_dir):
    """Collect the set of (head name, tail name, relation) facts in a training file.

    The set is cached as a .fact file under `truth_dir`, so later runs reload it
    instead of re-parsing the training data.
    """
    fact_file_name = data_file_name[data_file_name.find("train_"):]
    fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact"))

    # Reuse the cached fact file if it already exists.
    if os.path.exists(fact_file_name):
        fact_in_train = set([])
        triples = json.load(open(fact_file_name))
        for x in triples:
            fact_in_train.add(tuple(x))
        return fact_in_train

    # Otherwise enumerate every (head mention, tail mention, relation) triple.
    fact_in_train = set([])
    ori_data = json.load(open(data_file_name))
    for data in ori_data:
        vertexSet = data['vertexSet']
        for label in data['labels']:
            rel = label['r']
            for n1 in vertexSet[label['h']]:
                for n2 in vertexSet[label['t']]:
                    fact_in_train.add((n1['name'], n2['name'], rel))

    json.dump(list(fact_in_train), open(fact_file_name, "w"))

    return fact_in_train

def evaluate(data_path="./docred",
             test_data="dev.json",
             result_data="./result.json",
             output_path="./",
             train_annotated_path="/train_annotated.json",
             compare_distant=True):
    """Score a DocRED-style result file against the ground truth and report
    precision, recall, and the F1 / Ign-F1 metrics."""
    input_dir = data_path
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.exists(truth_dir):
        os.makedirs(truth_dir)

    if os.path.isdir(truth_dir):
        fact_in_train_annotated = gen_train_facts(data_path + train_annotated_path, truth_dir)
        if compare_distant:
            fact_in_train_distant = gen_train_facts(data_path + "/train_distant.json", truth_dir)
        else:
            fact_in_train_distant = set([])

    output_filename = os.path.join(output_path, 'scores.txt')
    output_file = open(output_filename, 'w')

    truth_file = os.path.join(data_path, test_data)
    truth = json.load(open(truth_file))

    # std maps (title, relation, head_idx, tail_idx) -> set of evidence sentence ids.
    std = {}
    tot_evidences = 0
    titleset = set([])

    title2vectexSet = {}

    for x in truth:
        title = x['title']
        titleset.add(title)

        vertexSet = x['vertexSet']
        title2vectexSet[title] = vertexSet

        for label in x['labels']:
            r = label['r']

            h_idx = label['h']
            t_idx = label['t']
            std[(title, r, h_idx, t_idx)] = set(label['evidence'])
            tot_evidences += len(label['evidence'])

    tot_relations = len(std)

    # Sort the predictions and drop exact duplicates
    # (same title, head, tail, and relation).
    # submission_answer_file = os.path.join(result_path, "result.json")
    submission_answer_file = result_data
    tmp = json.load(open(submission_answer_file))
    tmp.sort(key=lambda x: (x['title'], x['h_idx'], x['t_idx'], x['r']))
    submission_answer = [tmp[0]]
    for i in range(1, len(tmp)):
        x = tmp[i]
        y = tmp[i-1]
        if (x['title'], x['h_idx'], x['t_idx'], x['r']) != (y['title'], y['h_idx'], y['t_idx'], y['r']):
            submission_answer.append(tmp[i])
        # else:
        #     print("remove", x['title'], x['h_idx'], x['t_idx'], x['r'])

    correct_re = 0
    correct_evidence = 0
    pred_evi = 0

    correct_in_train_annotated = 0
    correct_in_train_distant = 0
    titleset2 = set([])

    for x in submission_answer:
        title = x['title']
        h_idx = x['h_idx']
        t_idx = x['t_idx']
        r = x['r']

        titleset2.add(title)
        if title not in title2vectexSet:
            continue

        vertexSet = title2vectexSet[title]

        if 'evidence' in x:
            evi = set(x['evidence'])
        else:
            evi = set([])

        pred_evi += len(evi)

        if (title, r, h_idx, t_idx) in std:
            correct_re += 1
            stdevi = std[(title, r, h_idx, t_idx)]
            correct_evidence += len(stdevi & evi)
            # Flag correct facts that already appear in the training data;
            # these are excluded by the Ign metrics below.
            in_train_annotated = in_train_distant = False
            for n1 in vertexSet[h_idx]:
                for n2 in vertexSet[t_idx]:
                    if (n1['name'], n2['name'], r) in fact_in_train_annotated:
                        in_train_annotated = True
                    if (n1['name'], n2['name'], r) in fact_in_train_distant:
                        in_train_distant = True

            if in_train_annotated:
                correct_in_train_annotated += 1
            if in_train_distant:
                correct_in_train_distant += 1

    # Relation extraction precision, recall, and F1.
    re_p = 1.0 * correct_re / len(submission_answer)

    re_r = 1.0 * correct_re / tot_relations
    if re_p + re_r == 0:
        re_f1 = 0
    else:
        re_f1 = 2.0 * re_p * re_r / (re_p + re_r)

    # Evidence precision, recall, and F1 (the epsilon avoids division by zero).
    evi_p = 1.0 * correct_evidence / pred_evi if pred_evi > 0 else 0
    evi_r = 1.0 * correct_evidence / (tot_evidences + 1e-18)
    if evi_p + evi_r == 0:
        evi_f1 = 0
    else:
        evi_f1 = 2.0 * evi_p * evi_r / (evi_p + evi_r)

    # Ign metrics: precision ignoring correct facts already seen in the
    # annotated / distant training data.
    re_p_ignore_train_annotated = 1.0 * (correct_re - correct_in_train_annotated) / (len(submission_answer) - correct_in_train_annotated)
    re_p_ignore_train = 1.0 * (correct_re - correct_in_train_distant) / (len(submission_answer) - correct_in_train_distant)

    if re_p_ignore_train_annotated + re_r == 0:
        re_f1_ignore_train_annotated = 0
    else:
        re_f1_ignore_train_annotated = 2.0 * re_p_ignore_train_annotated * re_r / (re_p_ignore_train_annotated + re_r)

    if re_p_ignore_train + re_r == 0:
        re_f1_ignore_train = 0
    else:
        re_f1_ignore_train = 2.0 * re_p_ignore_train * re_r / (re_p_ignore_train + re_r)

    print("Precision:", re_p)
    print("Recall:", re_r)
    print('RE_F1:', re_f1)
    print('Evi_F1:', evi_f1)
    print('RE_ign_F1:', re_f1_ignore_train_annotated)
    print('RE_ignore_distant_F1:', re_f1_ignore_train)

    output_file.write("Precision: %f\n" % re_p)
    output_file.write("Recall: %f\n" % re_r)

    output_file.write("RE_F1: %f\n" % re_f1)
    output_file.write("Evi_F1: %f\n" % evi_f1)

    output_file.write("RE_ignore_annotated_F1: %f\n" % re_f1_ignore_train_annotated)
    output_file.write("RE_ignore_distant_F1: %f\n" % re_f1_ignore_train)

    output_file.close()
111 changes: 111 additions & 0 deletions research/NEU/Refiner/docre/processData.py
@@ -0,0 +1,111 @@
import json
import re

import pandas as pd


def remove_space_before_punctuation(text):
    """Remove the stray spaces that tokenized text leaves around punctuation and brackets."""
    cleaned_text = re.sub(r'\s+([.,\'\"])', r'\1', text)
    cleaned_text = re.sub(r'\(\s+', '(', cleaned_text)
    cleaned_text = re.sub(r'\s+\)', ')', cleaned_text)
    cleaned_text = re.sub(r'\s*-\s*', '-', cleaned_text)

    return cleaned_text


def return_rel2dict(file_path='./dataset/docred/rel_info.json'):
    """Build the index/ID/name mappings for the relation types in rel_info.json."""
    with open(file_path, 'r', encoding='utf-8') as fr:
        rel_info = json.loads(fr.read())

    # Map each relation ID (e.g. "P17") to a class index and back; the last
    # index is reserved for the NA ("no relation") class.
    p_to_num = {}
    num_to_p = {}
    for i, key in enumerate(rel_info.keys()):
        p_to_num[key] = i
        num_to_p[i] = key
    num_to_p[len(rel_info.keys())] = 'NA'
    p_to_num['NA'] = len(rel_info.keys())

    # Map relation IDs to human-readable names and back.
    p_to_name = {}
    name_to_p = {}
    for key in rel_info.keys():
        p_to_name[key] = rel_info[key]
        name_to_p[rel_info[key]] = key
    p_to_name['NA'] = 'NA'
    name_to_p['NA'] = 'NA'
    return p_to_num, num_to_p, p_to_name, name_to_p

def return_templates(file_path='./dataNEW/rel_templates.xlsx'):
    """Load the relation-ID -> natural-language-template mapping from the Excel sheet."""
    df_templates = pd.read_excel(file_path)

    p2templates = {}
    ps = df_templates['relation ID'].values
    templates = df_templates['relation template'].values
    for i, p in enumerate(ps):
        p2templates[p.strip()] = templates[i]

    return p2templates

def return_docred(file_path='./dataset/docred/dev.json', test_data=False):
    """Parse a DocRED-style JSON file into titles, entity aliases, entity types,
    mention sentence ids, detokenized sentences, and (for labelled splits) relations."""
    with open(file_path, 'r', encoding='utf-8') as fr:
        json_info = fr.read()
    df = pd.read_json(json_info)

    titles = []
    for i in range(len(df['vertexSet'])):
        titles.append(df['title'][i])

    # Each entity may be mentioned under several names; collect the distinct surface forms.
    entities = []
    for i in range(len(df['vertexSet'])):
        enames = []
        for entity_class in df['vertexSet'][i]:
            ename = set()
            for entity_name in entity_class:
                ename.add(entity_name['name'])
            enames.append(list(ename))
        entities.append(enames)

    # All mentions of an entity share a type, so keep one representative type per entity.
    entity_types = []
    for i in range(len(df['vertexSet'])):
        etypes = []
        for entity_class in df['vertexSet'][i]:
            entity_type = set()
            for entity_name in entity_class:
                entity_type.add(entity_name['type'])
            etypes.append(list(entity_type)[0])
        entity_types.append(etypes)

    # Record the ids of the sentences in which each entity is mentioned.
    entity_indexs = []
    for i in range(len(df['vertexSet'])):
        eindexs = []
        for entity_class in df['vertexSet'][i]:
            eindex = set()
            for entity_name in entity_class:
                eindex.add(entity_name['sent_id'])
            eindexs.append(list(eindex))
        entity_indexs.append(eindexs)

    # Re-join each tokenized sentence into a string and tidy the punctuation spacing.
    documents_raw = []
    for i in range(len(df['sents'])):
        document_raw = []
        for sentence in df['sents'][i]:
            sentence_str = " ".join(sentence)
            document_raw.append(remove_space_before_punctuation(sentence_str))
        documents_raw.append(document_raw)

    # Test splits ship without labels, so only read them for labelled data.
    relations = []
    if not test_data:
        for i in range(len(df['sents'])):
            relations.append(df['labels'][i])
    return titles, entities, entity_types, entity_indexs, documents_raw, relations
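
For orientation, a brief usage sketch of these loaders (using the module's default paths):

```python
p_to_num, num_to_p, p_to_name, name_to_p = return_rel2dict()
p2templates = return_templates()
titles, entities, entity_types, entity_indexs, documents_raw, relations = return_docred()

print(titles[0])            # title of the first document
print(entities[0][0])       # alias list of its first entity
print(documents_raw[0][0])  # its first detokenized sentence
```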



