add NAACL 2025 paper Refiner's code #210

Open · wants to merge 5 commits into base: master
69 changes: 69 additions & 0 deletions research/NEU/Refiner/README.md
@@ -0,0 +1,69 @@
# DocREfiner

The code includes performance-test scripts for evaluating LLM-only methods.

## `test_only_llama.py`

### Code Overview

This script uses Llama-2-7B to perform document-level relation extraction. The demo processes the dataset, predicts relations with the help of the ATLOP SLM logits, and evaluates the results.
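To make the role of the SLM logits concrete, here is a minimal sketch (a hypothetical helper, not code from this repository) of how ATLOP scores over the relation classes could be turned into a shortlist of candidate relations for one entity pair, using the `num_to_p` and `p_to_name` maps built in `docre/processData.py`:

```python
import numpy as np

def topk_candidates(logits, num_to_p, p_to_name, k=4):
    """Return the names of the k highest-scoring relations for one entity pair."""
    top_ids = np.argsort(logits)[::-1][:k]  # indices of the k largest ATLOP logits
    return [p_to_name[num_to_p[int(i)]] for i in top_ids]
```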

### Requirements

* Python 3.7+
* mindspore
* mindformers
* scikit-learn
* tqdm
* pandas
* docre (custom module)
* c2net (custom module)

### Installation

Install the required Python packages using pip:

```bash
pip install mindspore mindformers scikit-learn tqdm pandas openpyxl
```

Ensure the custom module `docre` is available on your Python path. Note that `c2net` is not required: simply replace every path that involves `c2net` with your own local paths.
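For example, a minimal stand-in for the `c2net`-managed paths might look like this (the paths are placeholders; adjust them to your environment):

```python
# Hypothetical local paths replacing the c2net-managed ones.
dataset_path = "./dataset/docred"           # DocRED data directory
pretrain_model_path = "./models/llama2_7b"  # local Llama-2-7B checkpoint
output_path = "./output"                    # where predictions and scores are written
```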

### Usage

1. **Prepare the Environment**: Initialize the data context and set paths for datasets and pretrained models.
2. **Load Data and Models**: Use the provided functions to load datasets, relation templates, and pre-trained model logits.
3. **Generate Prompts**: Construct prompts and inputs for the model based on the loaded data.
4. **Run the Model**: Use the LLaMA2-7B model to generate predictions.
5. **Evaluate Results**: Save the model's predictions and evaluate them against the ground truth using the provided evaluation function.
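Put together, the flow looks roughly like the sketch below. The loaders come from `docre/processData.py` in this PR; `build_prompt` and `generate` are placeholders for the prompt construction and the mindformers inference call:

```python
from docre.processData import return_rel2dict, return_templates, return_docred

# 1-2. Load the relation maps, templates, and dataset (ATLOP logits are loaded separately).
p_to_num, num_to_p, p_to_name, name_to_p = return_rel2dict("./dataset/docred/rel_info.json")
p2templates = return_templates("./dataNEW/rel_templates.xlsx")
titles, entities, entity_types, entity_indexs, documents_raw, relations = \
    return_docred("./dataset/docred/dev.json")

# 3-4. Build a prompt per document / entity pair and query the LLM (placeholders):
#     completion = generate(build_prompt(document, options))

# 5. Save the predictions and score them with docre.evaluation.evaluate
#    (see the Evaluation section below).
```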

#### Running the Script

Execute the script with Python:

```bash
python test_only_llama.py
```

The script will process the data, generate prompts, run the model, and evaluate the results. Output predictions will be saved to `dev_result_llama2_atlop.json`.

#### Example Output

The script prints example inputs and completions, showing the format of the processed data and the model's predictions.

```plaintext
INSTRUCTION: Read the DOCUMENT and answer the QUESTION. Write the answers in ANSWER.
DOCUMENT: ...
QUESTION: Which of the following is right?
...
ANSWER:
```
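One plausible way to assemble such a prompt string (the helper is hypothetical; in practice the options would come from the relation templates filled in for the top-k candidate relations):

```python
from typing import List

def build_prompt(document: str, options: List[str]) -> str:
    """Assemble a prompt in the INSTRUCTION/DOCUMENT/QUESTION/ANSWER format above."""
    question = "Which of the following is right?\n" + "\n".join(
        f"{chr(ord('A') + i)}. {opt}" for i, opt in enumerate(options)
    )
    return (
        "INSTRUCTION: Read the DOCUMENT and answer the QUESTION. Write the answers in ANSWER.\n"
        f"DOCUMENT: {document}\n"
        f"QUESTION: {question}\n"
        "ANSWER:"
    )
```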

#### Evaluation

After running the script, the results are evaluated using the `evaluate` function, which compares the model's predictions with the ground truth and outputs performance metrics.
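`evaluate` (defined in `docre/evaluation.py`, shown below) can also be called directly. A sketch with illustrative paths:

```python
from docre.evaluation import evaluate

# Point the paths at your DocRED copy and result file.
evaluate(data_path="./dataset/docred",
         test_data="dev.json",
         result_data="./dev_result_llama2_atlop.json",
         output_path="./",
         compare_distant=True)
```

This prints the precision, recall, and F1 variants and writes the same numbers to a scores file under `output_path`. Note that `compare_distant=True` additionally requires `train_distant.json` under `data_path`.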

### Notes

- Ensure the `dataset_path`, `pretrain_model_path`, and other paths are correctly set according to your environment.
- Modify the top-k variable to change the number of top predictions considered.
- The script is set to ignore warnings for cleaner output.
185 changes: 185 additions & 0 deletions research/NEU/Refiner/docre/evaluation.py
@@ -0,0 +1,185 @@
#!/usr/bin/env python
import os
import json

def gen_train_facts(data_file_name, truth_dir):
    """Collect the set of (head name, tail name, relation) facts in a training file.

    The set is cached as a .fact file under `truth_dir`, so later runs reload it
    instead of re-parsing the training data.
    """
    fact_file_name = data_file_name[data_file_name.find("train_"):]
    fact_file_name = os.path.join(truth_dir, fact_file_name.replace(".json", ".fact"))

    # Reuse the cached fact file if it already exists.
    if os.path.exists(fact_file_name):
        fact_in_train = set([])
        triples = json.load(open(fact_file_name))
        for x in triples:
            fact_in_train.add(tuple(x))
        return fact_in_train

    # Otherwise enumerate every (head mention, tail mention, relation) triple.
    fact_in_train = set([])
    ori_data = json.load(open(data_file_name))
    for data in ori_data:
        vertexSet = data['vertexSet']
        for label in data['labels']:
            rel = label['r']
            for n1 in vertexSet[label['h']]:
                for n2 in vertexSet[label['t']]:
                    fact_in_train.add((n1['name'], n2['name'], rel))

    json.dump(list(fact_in_train), open(fact_file_name, "w"))

    return fact_in_train

def evaluate(data_path="./docred",
             test_data="dev.json",
             result_data="./result.json",
             output_path="./",
             train_annotated_path="/train_annotated.json",
             compare_distant=True):
    """Score a DocRED-style result file against the ground truth and report
    precision, recall, and the F1 / Ign-F1 metrics."""
    input_dir = data_path
    truth_dir = os.path.join(input_dir, 'ref')

    if not os.path.exists(truth_dir):
        os.makedirs(truth_dir)

    if os.path.isdir(truth_dir):
        fact_in_train_annotated = gen_train_facts(data_path + train_annotated_path, truth_dir)
        if compare_distant:
            fact_in_train_distant = gen_train_facts(data_path + "/train_distant.json", truth_dir)
        else:
            fact_in_train_distant = set([])

    output_filename = os.path.join(output_path, 'scores.txt')
    output_file = open(output_filename, 'w')

    truth_file = os.path.join(data_path, test_data)
    truth = json.load(open(truth_file))

    # std maps (title, relation, head_idx, tail_idx) -> set of evidence sentence ids.
    std = {}
    tot_evidences = 0
    titleset = set([])

    title2vectexSet = {}

    for x in truth:
        title = x['title']
        titleset.add(title)

        vertexSet = x['vertexSet']
        title2vectexSet[title] = vertexSet

        for label in x['labels']:
            r = label['r']

            h_idx = label['h']
            t_idx = label['t']
            std[(title, r, h_idx, t_idx)] = set(label['evidence'])
            tot_evidences += len(label['evidence'])

    tot_relations = len(std)

    # Sort the predictions and drop exact duplicates
    # (same title, head, tail, and relation).
    # submission_answer_file = os.path.join(result_path, "result.json")
    submission_answer_file = result_data
    tmp = json.load(open(submission_answer_file))
    tmp.sort(key=lambda x: (x['title'], x['h_idx'], x['t_idx'], x['r']))
    submission_answer = [tmp[0]]
    for i in range(1, len(tmp)):
        x = tmp[i]
        y = tmp[i-1]
        if (x['title'], x['h_idx'], x['t_idx'], x['r']) != (y['title'], y['h_idx'], y['t_idx'], y['r']):
            submission_answer.append(tmp[i])
        # else:
        #     print("remove", x['title'], x['h_idx'], x['t_idx'], x['r'])

    correct_re = 0
    correct_evidence = 0
    pred_evi = 0

    correct_in_train_annotated = 0
    correct_in_train_distant = 0
    titleset2 = set([])

    for x in submission_answer:
        title = x['title']
        h_idx = x['h_idx']
        t_idx = x['t_idx']
        r = x['r']

        titleset2.add(title)
        if title not in title2vectexSet:
            continue

        vertexSet = title2vectexSet[title]

        if 'evidence' in x:
            evi = set(x['evidence'])
        else:
            evi = set([])

        pred_evi += len(evi)

        if (title, r, h_idx, t_idx) in std:
            correct_re += 1
            stdevi = std[(title, r, h_idx, t_idx)]
            correct_evidence += len(stdevi & evi)
            # Flag correct facts that already appear in the training data;
            # these are excluded by the Ign metrics below.
            in_train_annotated = in_train_distant = False
            for n1 in vertexSet[h_idx]:
                for n2 in vertexSet[t_idx]:
                    if (n1['name'], n2['name'], r) in fact_in_train_annotated:
                        in_train_annotated = True
                    if (n1['name'], n2['name'], r) in fact_in_train_distant:
                        in_train_distant = True

            if in_train_annotated:
                correct_in_train_annotated += 1
            if in_train_distant:
                correct_in_train_distant += 1

    # Relation extraction precision, recall, and F1.
    re_p = 1.0 * correct_re / len(submission_answer)

    re_r = 1.0 * correct_re / tot_relations
    if re_p + re_r == 0:
        re_f1 = 0
    else:
        re_f1 = 2.0 * re_p * re_r / (re_p + re_r)

    # Evidence precision, recall, and F1 (the epsilon avoids division by zero).
    evi_p = 1.0 * correct_evidence / pred_evi if pred_evi > 0 else 0
    evi_r = 1.0 * correct_evidence / (tot_evidences + 1e-18)
    if evi_p + evi_r == 0:
        evi_f1 = 0
    else:
        evi_f1 = 2.0 * evi_p * evi_r / (evi_p + evi_r)

    # Ign metrics: precision ignoring correct facts already seen in the
    # annotated / distant training data.
    re_p_ignore_train_annotated = 1.0 * (correct_re - correct_in_train_annotated) / (len(submission_answer) - correct_in_train_annotated)
    re_p_ignore_train = 1.0 * (correct_re - correct_in_train_distant) / (len(submission_answer) - correct_in_train_distant)

    if re_p_ignore_train_annotated + re_r == 0:
        re_f1_ignore_train_annotated = 0
    else:
        re_f1_ignore_train_annotated = 2.0 * re_p_ignore_train_annotated * re_r / (re_p_ignore_train_annotated + re_r)

    if re_p_ignore_train + re_r == 0:
        re_f1_ignore_train = 0
    else:
        re_f1_ignore_train = 2.0 * re_p_ignore_train * re_r / (re_p_ignore_train + re_r)

    print("Precision:", re_p)
    print("Recall:", re_r)
    print('RE_F1:', re_f1)
    print('Evi_F1:', evi_f1)
    print('RE_ign_F1:', re_f1_ignore_train_annotated)
    print('RE_ignore_distant_F1:', re_f1_ignore_train)

    output_file.write("Precision: %f\n" % re_p)
    output_file.write("Recall: %f\n" % re_r)

    output_file.write("RE_F1: %f\n" % re_f1)
    output_file.write("Evi_F1: %f\n" % evi_f1)

    output_file.write("RE_ignore_annotated_F1: %f\n" % re_f1_ignore_train_annotated)
    output_file.write("RE_ignore_distant_F1: %f\n" % re_f1_ignore_train)

    output_file.close()
111 changes: 111 additions & 0 deletions research/NEU/Refiner/docre/processData.py
@@ -0,0 +1,111 @@
import json
import re

import pandas as pd


def remove_space_before_punctuation(text):
    """Remove the stray spaces that tokenized text leaves around punctuation and brackets."""
    cleaned_text = re.sub(r'\s+([.,\'\"])', r'\1', text)
    cleaned_text = re.sub(r'\(\s+', '(', cleaned_text)
    cleaned_text = re.sub(r'\s+\)', ')', cleaned_text)
    cleaned_text = re.sub(r'\s*-\s*', '-', cleaned_text)

    return cleaned_text


def return_rel2dict(file_path='./dataset/docred/rel_info.json'):
    """Build the index/ID/name mappings for the relation types in rel_info.json."""
    with open(file_path, 'r', encoding='utf-8') as fr:
        rel_info = json.loads(fr.read())

    # Map each relation ID (e.g. "P17") to a class index and back; the last
    # index is reserved for the NA ("no relation") class.
    p_to_num = {}
    num_to_p = {}
    for i, key in enumerate(rel_info.keys()):
        p_to_num[key] = i
        num_to_p[i] = key
    num_to_p[len(rel_info.keys())] = 'NA'
    p_to_num['NA'] = len(rel_info.keys())

    # Map relation IDs to human-readable names and back.
    p_to_name = {}
    name_to_p = {}
    for key in rel_info.keys():
        p_to_name[key] = rel_info[key]
        name_to_p[rel_info[key]] = key
    p_to_name['NA'] = 'NA'
    name_to_p['NA'] = 'NA'
    return p_to_num, num_to_p, p_to_name, name_to_p

def return_templates(file_path='./dataNEW/rel_templates.xlsx'):
    """Load the relation-ID -> natural-language-template mapping from the Excel sheet."""
    df_templates = pd.read_excel(file_path)

    p2templates = {}
    ps = df_templates['relation ID'].values
    templates = df_templates['relation template'].values
    for i, p in enumerate(ps):
        p2templates[p.strip()] = templates[i]

    return p2templates

def return_docred(file_path='./dataset/docred/dev.json', test_data=False):
    """Parse a DocRED-style JSON file into titles, entity aliases, entity types,
    mention sentence ids, detokenized sentences, and (for labelled splits) relations."""
    with open(file_path, 'r', encoding='utf-8') as fr:
        json_info = fr.read()
    df = pd.read_json(json_info)

    titles = []
    for i in range(len(df['vertexSet'])):
        titles.append(df['title'][i])

    # Each entity may be mentioned under several names; collect the distinct surface forms.
    entities = []
    for i in range(len(df['vertexSet'])):
        enames = []
        for entity_class in df['vertexSet'][i]:
            ename = set()
            for entity_name in entity_class:
                ename.add(entity_name['name'])
            enames.append(list(ename))
        entities.append(enames)

    # All mentions of an entity share a type, so keep one representative type per entity.
    entity_types = []
    for i in range(len(df['vertexSet'])):
        etypes = []
        for entity_class in df['vertexSet'][i]:
            entity_type = set()
            for entity_name in entity_class:
                entity_type.add(entity_name['type'])
            etypes.append(list(entity_type)[0])
        entity_types.append(etypes)

    # Record the ids of the sentences in which each entity is mentioned.
    entity_indexs = []
    for i in range(len(df['vertexSet'])):
        eindexs = []
        for entity_class in df['vertexSet'][i]:
            eindex = set()
            for entity_name in entity_class:
                eindex.add(entity_name['sent_id'])
            eindexs.append(list(eindex))
        entity_indexs.append(eindexs)

    # Re-join each tokenized sentence into a string and tidy the punctuation spacing.
    documents_raw = []
    for i in range(len(df['sents'])):
        document_raw = []
        for sentence in df['sents'][i]:
            sentence_str = " ".join(sentence)
            document_raw.append(remove_space_before_punctuation(sentence_str))
        documents_raw.append(document_raw)

    # Test splits ship without labels, so only read them for labelled data.
    relations = []
    if not test_data:
        for i in range(len(df['sents'])):
            relations.append(df['labels'][i])
    return titles, entities, entity_types, entity_indexs, documents_raw, relations
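
For orientation, a brief usage sketch of these loaders (using the module's default paths):

```python
p_to_num, num_to_p, p_to_name, name_to_p = return_rel2dict()
p2templates = return_templates()
titles, entities, entity_types, entity_indexs, documents_raw, relations = return_docred()

print(titles[0])            # title of the first document
print(entities[0][0])       # alias list of its first entity
print(documents_raw[0][0])  # its first detokenized sentence
```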



