Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dnnc with no finetuning #1630

Merged
merged 61 commits into from
Jun 30, 2023
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
07e2cbb
Add mrpc binary head config
vaskonov Jul 18, 2022
43f8a9a
Fix binary head
vaskonov Jul 18, 2022
612c053
add few-shot infer support
nastyachizhikova Aug 23, 2022
9b49988
add few-shot metrics
nastyachizhikova Aug 23, 2022
40f9413
dnnc infer eval
Sep 21, 2022
ee36fe6
add preprocessor
Sep 21, 2022
a976260
init dnnc
Sep 28, 2022
a4dc57b
add dnnc training
Oct 19, 2022
45ee6e3
modified data processing
Nov 2, 2022
4209b05
fix imports
Nov 2, 2022
bbc83b7
change nli labels to strings
Nov 4, 2022
52e378f
add documentation
Nov 16, 2022
be99c31
fix conversion of labels to ids and ids to labels
Dec 5, 2022
612f08a
binary head dropout fix
Dec 5, 2022
50b7875
fix few-shot dos
Dec 5, 2022
96bc194
add return format flag
Dec 6, 2022
98cb660
add dataset and model downloading
Dec 6, 2022
b17791b
Fix: change paths
vaskonov Dec 7, 2022
d5c657d
Fix: change paths
vaskonov Dec 7, 2022
79ebde6
Fix: download paths
vaskonov Dec 7, 2022
a27b92a
remove skdlearn requirements
Dec 7, 2022
61db0e6
fex ix metrics
Dec 7, 2022
1d392b3
fix configs format
Dec 7, 2022
c188a55
Merge branch 'dnnc' of https://github.com/deeppavlov/DeepPavlov into …
Dec 7, 2022
39e6ad1
Fix: configs format and paths
Dec 7, 2022
dab193a
Upd: documentation
Dec 12, 2022
718d1de
Fix: typing
Dec 12, 2022
7cd75a5
Upd: add oos removal in iterator
Dec 12, 2022
e8cf40c
Fix: config format
Dec 12, 2022
72867ca
made the support dataset part of the input
Mar 7, 2023
c20d2d5
Fix: index.rst
Mar 13, 2023
5d7a198
Fix: index.rst
Mar 15, 2023
5742605
Fix: empty reference in docs
LogicZMaksimka Mar 27, 2023
6faccd3
Fix: metrics registry
LogicZMaksimka Mar 27, 2023
d711fa5
Fix: bidirectional scores averaging
LogicZMaksimka Mar 29, 2023
af23394
Fix: index.rst
LogicZMaksimka Mar 29, 2023
cdf0d9f
Conflicts resolved
LogicZMaksimka Mar 29, 2023
7702d65
refactor: minor style changes
IgnatovFedor Apr 7, 2023
fff9a9d
Fix: accuracy_oos arguments
LogicZMaksimka Apr 14, 2023
c705688
refactor: deleted a few-shot iterator that was not used anywhere
LogicZMaksimka Apr 19, 2023
a5975c8
Refactor: dnnc_preprocessor
LogicZMaksimka Apr 19, 2023
6344c34
Refactor: dnnc_proba2labels
LogicZMaksimka Apr 19, 2023
b4bd9f1
Refactor: config dnnc_infer
LogicZMaksimka Apr 19, 2023
799e64c
canceled changes in torch_transformers_classifier
LogicZMaksimka Apr 19, 2023
56ec9e3
Fix: removed few_shot_iterator from registry
LogicZMaksimka Apr 20, 2023
cbf89b3
Merge branch 'dev' with fixed bug
LogicZMaksimka Apr 28, 2023
255a425
fix: delete whitespaces
vaskonov Jun 14, 2023
8bdf2bd
fix: delete unused
vaskonov Jun 14, 2023
ebcff3a
fix: call arguments
vaskonov Jun 14, 2023
c6f57d0
fix: delete whitespaces
vaskonov Jun 14, 2023
dc55ea7
fix: remove unused
vaskonov Jun 14, 2023
243589b
fix: __call__ arguments
vaskonov Jun 14, 2023
ebcdcff
docs: optimizer few_shot_classification ipynb file
IgnatovFedor Jun 27, 2023
4537463
remove: trailing spaces
IgnatovFedor Jun 27, 2023
a05646b
fix: remove unused metrics
LogicZMaksimka Jun 27, 2023
628f1e3
remove: unused parameters
LogicZMaksimka Jun 28, 2023
f756dbe
docs: updated to new format
LogicZMaksimka Jun 28, 2023
f3783b5
Merge branch 'dev' into feat/dnnc_no_finetuning
LogicZMaksimka Jun 28, 2023
ca199dc
refactor: rename config
LogicZMaksimka Jun 28, 2023
fcc1d9d
docs: optimized few-shot classification doc
IgnatovFedor Jun 29, 2023
59348f5
feat: few-shot tests
IgnatovFedor Jun 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions deeppavlov/configs/classifiers/dnnc_infer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
{
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved
"dataset_reader": {
"class_name": "basic_classification_reader",
"format": "json",
"orient": "split",
"x": "text",
"y": "category",
"data_path": "{DOWNLOADS_PATH}",
"train": "train.json",
"valid": "dev.json",
"test": "test.json"
},
"dataset_iterator": {
"class_name": "basic_classification_iterator",
"seed": 42,
"shuffle": true
},
"chainer": {
"in": ["input"],
"in_y": ["y_true"],
"pipe": [
{
"class_name": "dnnc_input_preprocessor",
"in": ["input"],
"out": ["x", "x_support", "x_populated", "y_support"],
"support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json",
"bidirectional": true
},
{
"class_name": "torch_transformers_preprocessor",
"in": ["x_populated", "x_support"],
"out": ["bert_features"],
"vocab_file": "{BASE_MODEL}",
"do_lower_case": true,
"max_seq_length": 128
},
{
"class_name": "torch_transformers_classifier",
"main": true,
"in": ["bert_features"],
"out": ["simmilarity_scores"],
"n_classes": 2,
"return_probas": true,
"pretrained_bert": "{BASE_MODEL}",
"save_path": "{MODEL_PATH}/model",
"load_path": "{MODEL_PATH}/model",
"is_binary": "{BINARY_CLASSIFICATION}",
"optimizer_parameters": {"lr": 2e-05}
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved
},
{
"class_name": "dnnc_proba2labels",
"is_binary": "{BINARY_CLASSIFICATION}",
"in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
"out": ["y_pred"],
"multilabel": false,
"confidence_threshold": 0.0,
"pooling": "max"
}
],
"out": ["y_pred"]
},
"train": {
"batch_size": 1,
"metrics": [
{
"name": "accuracy_oos",
"inputs": ["y_true", "y_pred"],
"exclude_oos": true
},
{
"name": "oos_scores",
"inputs": ["y_true", "y_pred"]
}
],
"show_examples": false,
"evaluation_targets": ["test"],
"class_name": "torch_trainer"
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150",
"SUPPORT_DATA_PATH": "{ROOT_PATH}/preprocessed_datasets",
"MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
"BINARY_CLASSIFICATION": true,
"BASE_MODEL": "roberta-base"
},
"download": [
{
"url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
"subdir": "{MODEL_PATH}"
},
{
"url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz",
"subdir": "{DOWNLOADS_PATH}"
}
]
}
}
3 changes: 2 additions & 1 deletion deeppavlov/core/common/metrics_registry.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"acc": "deeppavlov.metrics.accuracy:round_accuracy",
"accuracy": "deeppavlov.metrics.accuracy:accuracy",
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved
"accuracy_oos": "deeppavlov.metrics.accuracy:accuracy_oos",
"average__ner_f1__f1_macro__f1": "deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1",
"average__roc_auc__roc_auc__ner_f1": "deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1",
"bleu": "deeppavlov.metrics.bleu:bleu",
Expand All @@ -19,6 +19,7 @@
"multitask_token_accuracy": "deeppavlov.metrics.accuracy:multitask_token_accuracy",
"ner_f1": "deeppavlov.metrics.fmeasure:ner_f1",
"ner_token_f1": "deeppavlov.metrics.fmeasure:ner_token_f1",
"oos_scores": "deeppavlov.metrics.fmeasure:oos_scores",
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved
"pearson_correlation": "deeppavlov.metrics.correlation:pearson_correlation",
"per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu",
"per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy",
Expand Down
3 changes: 3 additions & 0 deletions deeppavlov/core/common/registry.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
"dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor",
"docred_reader": "deeppavlov.dataset_readers.docred_reader:DocREDDatasetReader",
"document_chunker": "deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker",
"dnnc_input_preprocessor": "deeppavlov.models.preprocessors.dnnc_preprocessor:InputPreprocessor",
"dnnc_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:Proba2Labels",
"entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser",
"entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker",
"faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader",
"few_shot_iterator": "deeppavlov.dataset_iterators.few_shot_iterator:FewShotIterator",
"fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder",
"fit_trainer": "deeppavlov.core.trainers.fit_trainer:FitTrainer",
"hashing_tfidf_vectorizer": "deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer",
Expand Down
147 changes: 147 additions & 0 deletions deeppavlov/dataset_iterators/few_shot_iterator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from logging import getLogger
from pathlib import Path
from random import Random
from typing import Dict, Any, List, Tuple, Optional

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.data_learning_iterator import DataLearningIterator

# Pair labels produced by FewShotIterator.convert2nli: ENTAILMENT marks a pair of texts that
# share a class label, NON_ENTAILMENT marks a pair of texts with different class labels.
ENTAILMENT = 'entailment'
NON_ENTAILMENT = 'non_entailment'
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved

# File that FewShotIterator.__init__ (re)writes with the few-shot train split, as JSON with
# "columns" and "data" keys. The leading "~" is expanded with Path.expanduser() before writing.
SUPPORT_DATASET_PATH = "~/.deeppavlov/preprocessed_datasets/support_dataset.json"
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved

# Module-level logger; used by FewShotIterator.get_shot_examples to warn about under-filled labels.
log = getLogger(__name__)


@register('few_shot_iterator')
class FewShotIterator(DataLearningIterator):
    """Dataset iterator that builds few-shot splits and, optionally, NLI-style text pairs.

    On construction the iterator:

    1. removes every example labelled ``'oos'`` from the train/valid/test splits;
    2. keeps at most ``shot`` (train) or ``shot_test`` (valid/test) examples per label;
    3. dumps the resulting train split to ``SUPPORT_DATASET_PATH`` as JSON with
       ``"columns"`` and ``"data"`` keys;
    4. if ``return_nli_format`` is set, replaces each split with pairs of texts labelled
       ``ENTAILMENT`` (both texts share a label) or ``NON_ENTAILMENT`` (labels differ).

    Note that ``DataLearningIterator.__init__`` is not called: ``shuffle``, ``random``,
    ``train``, ``valid``, ``test`` and ``data`` are all assigned here.

    Args:
        data: mapping from split name (``'train'``, ``'valid'``, ``'test'``) to a list of
            ``(text, label)`` pairs; missing splits are treated as empty.
        seed: seed for the internal ``random.Random`` instance.
        shuffle: whether the outputs of :meth:`get_shot_examples` and :meth:`convert2nli`
            are shuffled.
        shot: maximum number of train examples kept per label; ``None`` keeps everything.
        shot_test: same as ``shot`` but applied to the valid and test splits.
        return_nli_format: convert every split to NLI pairs with :meth:`convert2nli`.
    """

    def __init__(self,
                 data: Dict[str, List[Tuple[Any, Any]]],
                 seed: Optional[int] = None,
                 shuffle: bool = True,
                 shot: Optional[int] = None,
                 shot_test: Optional[int] = None,
                 return_nli_format: bool = False,
                 *args, **kwargs) -> None:
        self.shuffle = shuffle
        self.random = Random(seed)

        # Out-of-scope examples are dropped before the per-label subsampling.
        self.train = self.get_shot_examples(self.delete_oos(data.get('train', [])), shot)
        self.valid = self.get_shot_examples(self.delete_oos(data.get('valid', [])), shot_test)
        self.test = self.get_shot_examples(self.delete_oos(data.get('test', [])), shot_test)

        # The support set is saved in its plain (text, label) form, i.e. before NLI conversion.
        self._save_support_dataset(self.train)

        if return_nli_format:
            self.train = self.convert2nli(self.train)
            self.valid = self.convert2nli(self.valid)
            self.test = self.convert2nli(self.test)

        self.data = {
            'train': self.train,
            'valid': self.valid,
            'test': self.test,
            'all': self.train + self.test + self.valid
        }

    @staticmethod
    def _save_support_dataset(examples: List[Tuple[Any, Any]]) -> None:
        """Write ``examples`` to ``SUPPORT_DATASET_PATH``, creating parent directories if needed."""
        save_path = Path(SUPPORT_DATASET_PATH).expanduser()
        save_path.parent.mkdir(parents=True, exist_ok=True)
        json_dict = {
            "columns": ["text", "category"],
            "data": [[text, label] for text, label in examples]
        }
        with save_path.open("w", encoding="utf-8") as file:
            json.dump(json_dict, file, indent=4)

    def _gather_info(self, data: List[Tuple[Any, Any]]) -> Tuple[Dict, Dict]:
        """Index ``data`` by label.

        Returns:
            A pair ``(label2examples, label2negative)``: the first maps every label to the
            list of its texts, the second maps every label to the list of all other labels.
        """
        # dict.fromkeys keeps first-occurrence order. A set would make the label order, and
        # therefore the seeded shuffle of the generated pairs, vary between interpreter runs.
        unique_labels = list(dict.fromkeys(label for _, label in data))

        label2examples = {label: [] for label in unique_labels}
        for text, label in data:
            label2examples[label].append(text)

        label2negative = {
            label: unique_labels[:i] + unique_labels[i + 1:]
            for i, label in enumerate(unique_labels)
        }

        return label2examples, label2negative

    def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any], Any]]:
        """Turn ``(text, label)`` examples into ``[[text_a, text_b], nli_label]`` items.

        Every text is paired with each text of every other label (``NON_ENTAILMENT``) and
        with each text of its own label that differs from it (``ENTAILMENT``). The result is
        shuffled when ``self.shuffle`` is true. Empty input is returned unchanged.
        """
        if not data:
            return data

        label2examples, label2negative = self._gather_info(data)

        nli_triplets = []
        # negative examples
        for text, label in data:
            for negative_label in label2negative[label]:
                for negative_example in label2examples[negative_label]:
                    nli_triplets.append([[text, negative_example], NON_ENTAILMENT])

        # positive examples
        for text, label in data:
            for positive_example in label2examples[label]:
                # Skip pairing a text with itself (or with an identical duplicate).
                if positive_example != text:
                    nli_triplets.append([[text, positive_example], ENTAILMENT])

        if self.shuffle:
            self.random.shuffle(nli_triplets)

        return nli_triplets

    def delete_oos(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]:
        """Return a new list with every example whose label equals ``'oos'`` removed."""
        return [[text, label] for text, label in data if label != 'oos']

    def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: Optional[int]) -> List[Tuple[Any, Any]]:
        """Keep at most ``shot`` randomly chosen examples per label.

        Args:
            data: list of ``(text, label)`` pairs; it is not modified.
            shot: per-label limit; ``None`` returns ``data`` as is.

        Returns:
            List of ``(text, label)`` tuples, shuffled when ``self.shuffle`` is true.
            A warning is logged if at least one label has fewer than ``shot`` examples.
        """
        if shot is None:
            return data

        # Shuffle a copy so the kept examples are random while the caller's list stays intact.
        shuffled = list(data)
        self.random.shuffle(shuffled)

        label2texts: Dict[Any, List[Any]] = {}
        for text, label in shuffled:
            texts = label2texts.setdefault(label, [])
            if len(texts) < shot:
                texts.append(text)

        # any() is False for an empty split, so empty input yields [] instead of raising.
        if any(len(texts) < shot for texts in label2texts.values()):
            log.warning(f"Some labels have less than \"shot\"={shot} examples")

        new_data = []
        for label, texts in label2texts.items():
            new_data.extend((text, label) for text in texts)

        if self.shuffle:
            self.random.shuffle(new_data)

        return new_data
15 changes: 14 additions & 1 deletion deeppavlov/metrics/accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


import itertools
from typing import List, Iterable
from typing import List

import numpy as np

Expand Down Expand Up @@ -188,3 +188,16 @@ def kbqa_accuracy(y_true, y_predicted):
total_correct += 1

return total_correct / len(y_true) if len(y_true) else 0


@register_metric('accuracy_oos')
def accuracy_oos(y_true, y_pred, exclude_oos: bool = False) -> float:
    """Compute accuracy, optionally ignoring examples whose true label is ``'oos'``.

    Args:
        y_true: true labels.
        y_pred: predicted labels, aligned with ``y_true``.
        exclude_oos: if True, every position where ``y_true`` equals ``'oos'`` is removed
            from both sequences before the score is computed.

    Returns:
        The value returned by ``accuracy`` for the (possibly filtered) labels.
    """
    if not exclude_oos:
        return accuracy(y_true, y_pred)

    true_labels = np.array(y_true)
    predicted_labels = np.array(y_pred)

    # Positions of out-of-scope examples, located in the true labels only.
    oos_positions = np.where(true_labels == 'oos')

    return accuracy(np.delete(true_labels, oos_positions, 0),
                    np.delete(predicted_labels, oos_positions, 0))
9 changes: 9 additions & 0 deletions deeppavlov/metrics/fmeasure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

from deeppavlov.core.common.metrics_registry import register_metric

Expand Down Expand Up @@ -417,3 +418,11 @@ def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_prob
roc_auc2 = roc_auc_score(true_onehot2, pred_probas2)
ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100
return (roc_auc1 + roc_auc2 + ner_f1_3) / 3


@register_metric('oos_scores')
def oos_scores(y_true, y_pred):
    """Score detection of the ``"oos"`` label as a binary classification problem.

    Both label sequences are reduced to boolean arrays (``True`` where the label equals
    ``"oos"``) and passed to ``precision_recall_fscore_support`` with ``average='binary'``.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"fbeta_score"``.
    """
    is_oos_true = np.array(y_true) == "oos"
    is_oos_pred = np.array(y_pred) == "oos"
    # The fourth returned element (support) is not reported.
    precision, recall, fbeta_score, _ = precision_recall_fscore_support(
        is_oos_true, is_oos_pred, average='binary'
    )
    return {"precision": precision, "recall": recall, "fbeta_score": fbeta_score}
IgnatovFedor marked this conversation as resolved.
Show resolved Hide resolved
Loading