WIP: Integration of WikiKB into NEL benchmark #149


Draft: rmitsch wants to merge 39 commits into v3 from feature/nel-benchmark-wikikb

Commits (39)
1c1a5fa
WIP: speedup for entity embedding with .pipe(); improvements to Wiki …
rmitsch Oct 20, 2022
6fd27f7
Remove redundant test code.
rmitsch Oct 20, 2022
785fb0d
Adjust to new DB structure.
rmitsch Oct 24, 2022
432ce49
Update project.yml and test.
rmitsch Oct 24, 2022
0d9cdd4
Fix project.yml.
rmitsch Oct 24, 2022
de3a465
Update project.yml to install wikid.
rmitsch Oct 24, 2022
1ae4ea0
Add setup for wikid.
rmitsch Oct 24, 2022
f5d6d89
Update test and step sequence.
rmitsch Oct 24, 2022
a557e78
Various fixes w.r.t. wikid integration.
rmitsch Oct 26, 2022
0dbc934
Fix parameter passing to parse_corpus.
rmitsch Oct 26, 2022
6d96f20
Revert unrelated test changes to UD parser config.
rmitsch Oct 26, 2022
f0cb64a
Add comment.
rmitsch Oct 26, 2022
e166144
Remove assets command from test.
rmitsch Oct 26, 2022
dc7977b
Remove assets command from test.
rmitsch Oct 26, 2022
e6fb7fd
Change description for 'preprocess' step.
rmitsch Oct 26, 2022
dc2e289
Fix configuration error.
rmitsch Oct 26, 2022
5989cc1
Remove max_epochs limit.
rmitsch Oct 26, 2022
5e5e1fd
Remove comment in nel.cfg. Reintroduce epochs override in CI.
rmitsch Oct 26, 2022
32ab441
Disable overrides temporarily for wikid calls.
rmitsch Oct 27, 2022
aaa40b7
Clean up test.
rmitsch Oct 27, 2022
8ff7095
Readd overrides in test.
rmitsch Oct 27, 2022
df67880
Set default CG to reduce test time.
rmitsch Oct 27, 2022
8e1bdc9
Fix typo.
rmitsch Oct 27, 2022
40f66eb
Update wikid repo URL.
rmitsch Oct 28, 2022
2ed7900
Remove test code.
rmitsch Oct 28, 2022
2051342
Start transition towards WikiKB.
rmitsch Nov 21, 2022
1533943
Changes for integration of WikiKB into project workflow.
rmitsch Nov 23, 2022
c0d7ebf
Trigger new tests.
rmitsch Nov 23, 2022
fcdc3c0
Merge branch 'v3' into feature/nel-benchmark-wikikb
rmitsch Nov 23, 2022
9757d35
Further adjustments for new-style KBs and removal of NER.
rmitsch Nov 24, 2022
bddbbdb
Fix entity handling. Simplify workflow in Dataset class.
rmitsch Nov 29, 2022
de92c61
Switch evaluation to get_candidates_all().
rmitsch Nov 29, 2022
ecb3d87
Adjust config.
rmitsch Nov 29, 2022
adf8a7e
Further adjustments in NEL workflow + script for debugging training v…
rmitsch Dec 1, 2022
7bc9dd8
Refactor. Start to add support for separate mention-candidate retriev…
rmitsch Dec 5, 2022
c823add
Fix NEL training.
rmitsch Dec 6, 2022
ff6a3d3
Update comments and config.
rmitsch Dec 7, 2022
db25042
Clean up and adjust config.
rmitsch Dec 7, 2022
946b5ee
Remove unused parameter 'db' from nel.cfg.
rmitsch Jan 19, 2023
36 changes: 17 additions & 19 deletions benchmarks/nel/configs/nel.cfg
@@ -17,7 +17,7 @@ gpu_allocator = null

 [nlp]
 lang = "en"
-pipeline = ["senter","ner","entity_linker"]
+pipeline = ["entity_linker"]
 disabled = []
 before_creation = null
 after_creation = null
@@ -27,23 +27,19 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

 [components]

-[components.senter]
-source = "${paths.base_nlp}"
-
-[components.ner]
-source = "${paths.base_nlp}"
-component = "ner"
-
 [components.entity_linker]
 factory = "entity_linker"
 entity_vector_length = 64
-incl_context = true
-incl_prior = true
+incl_context = True
+incl_prior = True
 labels_discard = []
 get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+get_candidates_all = {"@misc":"spacy.CandidateAllGenerator.v1"}
+candidates_doc_mode = True
+generate_empty_kb = {"@misc":"spacy.EmptyWikiKB.v1"}

 [components.entity_linker.model]
-@architectures = "spacy.EntityLinker.v1"
+@architectures = "spacy.EntityLinker.v2"
 nO = null

 [components.entity_linker.model.tok2vec]
@@ -68,7 +64,7 @@ lookups = null

 [initialize.components.entity_linker]

 [initialize.components.entity_linker.kb_loader]
-@misc = "spacy.KBFromFile.v1"
+@misc = "spacy.WikiKBFromFile.v1"
 kb_path = ${paths.kb}

 [initialize.tokenizer]
@@ -77,12 +73,14 @@ kb_path = ${paths.kb}

 [corpora]

 [corpora.train]
-@readers = "spacy.Corpus.v1"
+@readers = "EntityEnrichedCorpusReader.v1"
 path = ${paths.train}
+path_nlp_base = ${paths.vectors}

 [corpora.dev]
-@readers = "spacy.Corpus.v1"
+@readers = "EntityEnrichedCorpusReader.v1"
 path = ${paths.dev}
+path_nlp_base = ${paths.vectors}

 [training]
@@ -94,15 +92,15 @@ dropout = 0.2

 patience = 10000
 eval_frequency = 200
 accumulate_gradient = 2
-max_epochs = 0
-max_steps = 500
-annotating_components = ["senter"]
-frozen_components = ["senter","ner"]
+max_epochs = 25
+max_steps = 10000
+annotating_components = []
+frozen_components = []
 before_to_disk = null

 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
-progress_bar = false
+progress_bar = true

 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
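Note on the config changes above: the entity_linker now runs in document-level candidate mode (candidates_doc_mode = True), resolving candidates for all mentions of a doc in one call via get_candidates_all rather than one KB query per mention. spacy.CandidateAllGenerator.v1, spacy.EmptyWikiKB.v1 and EntityEnrichedCorpusReader.v1 come from the spaCy fork pinned in requirements.txt and from wikid, so their exact APIs are not shown in this diff. A minimal sketch of what a doc-level candidate generator along these lines could look like (registry name and signature are assumptions for illustration, not the fork's actual interface):

# Hedged sketch only: the real "spacy.CandidateAllGenerator.v1" lives in the
# spaCy fork pinned in requirements.txt; this illustrates the idea with an
# assumed signature and a hypothetical registry name.
from typing import Callable, Iterable, Iterator

import spacy
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


@spacy.registry.misc("demo.CandidateAllGenerator.v1")  # hypothetical name
def create_candidates_all() -> Callable[
    [KnowledgeBase, Iterator[Iterable[Span]]],
    Iterator[Iterable[Iterable[Candidate]]],
]:
    def get_candidates_all(
        kb: KnowledgeBase, mentions_per_doc: Iterator[Iterable[Span]]
    ) -> Iterator[Iterable[Iterable[Candidate]]]:
        # One call per doc: a disk-backed KB such as WikiKB can batch its
        # lookups here instead of being queried once per mention.
        for mentions in mentions_per_doc:
            yield [kb.get_alias_candidates(mention.text) for mention in mentions]

    return get_candidates_all

Batching per document matters for WikiKB because each lookup goes through SQLite (and potentially the Annoy index) rather than an in-memory dictionary.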
76 changes: 43 additions & 33 deletions benchmarks/nel/project.yml
@@ -1,19 +1,21 @@
 title: 'NEL Benchmark'
 description: "Pipeline for benchmarking NEL approaches (incl. candidate generation and entity disambiguation)."
 vars:
-  run: "cg-default"
+  run: "default"
   language: "en"
   config: "nel.cfg"
-  vectors_model: "en_core_web_lg"
-  version: "0.0.5"
+  base_model: "en_core_web_lg"
+  version: "0.0.6"
   dataset: "mewsli_9"
   gpu_id: ""
   download_all_wiki_assets: "" # "--extra" to download full Wiki dumps.
   filter: "True" # Whether to only use parts of Wiki data and corpus containing filter terms.
-  training_max_steps: 1000
+  training_max_steps: 10000
   eval_highlight_metric: "F" # one of ("F", "r", "p")

-directories: ["assets", "training", "configs", "scripts", "corpora", "evaluation"]
+directories: ["assets", "training", "configs", "src", "corpora", "evaluation"]

+check_requirements: True
+
 workflows:
   all:
@@ -24,7 +26,7 @@ workflows:
     - wikid_download_assets
     - wikid_parse
    - wikid_create_kb
-    - parse_corpus
+    - extract_annotations
     - compile_corpora
     - train
     - evaluate
@@ -37,27 +39,25 @@ commands:
   - name: download_mewsli9
     help: Download Mewsli-9 dataset.
     script:
-      - bash scripts/datasets/download_mewsli-9.sh
+      - bash src/datasets/download_mewsli-9.sh
     outputs:
-      - assets/mewsli_9/raw
+      - assets/mewsli_9/

   - name: download_model
     help: "Download a model with pretrained vectors and NER component."
     script:
-      - "python -m spacy download ${vars.vectors_model}"
+      - "python -m spacy download ${vars.base_model}"

   - name: wikid_clone
-    help: Clone `wikid` to prepare Wiki database and `KnowledgeBase`.
+    help: "Clone `wikid` to prepare Wiki database and `KnowledgeBase`."
     script:
-      - git clone https://github.com/explosion/wikid.git --branch main
-      - pip install -r wikid/requirements.txt
-    outputs:
-      - wikid
+      - "git clone https://github.com/rmitsch/wikid.git --branch fix/reestablish-db-connection-after-load"
+      - "pip install -r wikid/requirements.txt"

   - name: preprocess
     help: Preprocess and clean corpus data.
     script:
-      - "env PYTHONPATH=. python ./scripts/clean_data.py ${vars.dataset} ${vars.language}"
+      - "env PYTHONPATH=. python ./src/cli_clean_data.py ${vars.dataset} ${vars.language}"
     deps:
       - "assets/${vars.dataset}/raw"
     outputs:
@@ -73,75 +73,85 @@
   - name: wikid_parse
     help: "Parse Wikipedia dumps. This can take a long time if you're not using the filtered dumps!"
     script:
-      - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter True"
+      - "spacy project run parse wikid --vars.language ${vars.language} --vars.filter ${vars.filter}"
     outputs:
       - "wikid/output/${vars.language}/wiki.sqlite3"

   - name: wikid_create_kb
     help: "Create the knowledge base and write it to file."
     script:
-      - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.vectors_model}"
+      - "spacy project run create_kb wikid --vars.language ${vars.language} --vars.vectors_model ${vars.base_model} --force"
     deps:
       - "wikid/output/${vars.language}/wiki.sqlite3"
-    outputs_no_cache:
+    outputs:
       - "wikid/output/${vars.language}/kb"
-      - "wikid/output/${vars.language}/nlp"
+      - "wikid/output/${vars.language}/wiki.annoy"

-  - name: parse_corpus
-    help: "Parse corpus to generate entity and annotation lookups used for corpora compilation."
+  - name: extract_annotations
+    help: "Extract annotations from corpus."
     script:
-      - "env PYTHONPATH=. python ./scripts/parse_corpus.py ${vars.dataset} ${vars.language}"
+      - "env PYTHONPATH=. python ./src/cli_extract_annotations.py ${vars.dataset} ${vars.language}"
     deps:
       - "assets/${vars.dataset}/clean"
       - "wikid/output/${vars.language}/wiki.sqlite3"
     outputs:
-      - "assets/${vars.dataset}/entities.pkl"
-      - "assets/${vars.dataset}/entities_failed_lookup.pkl"
       - "assets/${vars.dataset}/annotations.pkl"

   - name: compile_corpora
     help: "Compile corpora, separated in train/dev/test sets."
     script:
-      - "env PYTHONPATH=. python ./scripts/compile_corpora.py ${vars.dataset} ${vars.language} ${vars.filter}"
+      - "env PYTHONPATH=. python ./src/cli_compile_corpora.py ${vars.dataset} ${vars.language} ${vars.base_model} ${vars.filter}"
     deps:
-      - "assets/${vars.dataset}/entities.pkl"
-      - "assets/${vars.dataset}/entities_failed_lookups.pkl"
+      - "assets/${vars.dataset}/annotations.pkl"
       - "wikid/output/${vars.language}/kb"
       - "wikid/output/${vars.language}/nlp"
       - "configs/datasets.yml"
     outputs:
       - "corpora/${vars.dataset}/train.spacy"
       - "corpora/${vars.dataset}/dev.spacy"
       - "corpora/${vars.dataset}/test.spacy"

+  - name: retrieve_mentions_candidates
+    help: "Retrieve candidates for mentions in corpus and persist them in file. This is an optional step, but speeds up training and evaluation."
+    script:
+      - "env PYTHONPATH=. python ./src/cli_retrieve_mentions_candidates.py ${vars.dataset} ${vars.language}"
+    deps:
+      - "wikid/output/${vars.language}/kb"
+      - "wikid/output/${vars.language}/wiki.annoy"
+      - "wikid/output/${vars.language}/wiki.sqlite3"
+      - "assets/${vars.dataset}/annotations.pkl"
+    outputs:
+      - "corpora/${vars.dataset}/mention_candidates.pkl"
+
   - name: train
     help: "Train a new Entity Linking component. Pass --vars.gpu_id GPU_ID to train with GPU. Training with some datasets may take a long time!"
     script:
-      - "bash scripts/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}"
+      - "bash src/train.sh ${vars.dataset} '${vars.run}' ${vars.language} ${vars.config} ${vars.training_max_steps} ${vars.gpu_id}"
     outputs:
       - "training/${vars.dataset}/${vars.run}"
     deps:
       - "wikid/output/${vars.language}/kb"
-      - "wikid/output/${vars.language}/nlp"
+      - "wikid/output/${vars.language}/wiki.annoy"
+      - "training/base-nlp/${vars.language}"
       - "corpora/${vars.dataset}/train.spacy"
       - "corpora/${vars.dataset}/dev.spacy"

   - name: evaluate
     help: "Evaluate on the test set."
     script:
-      - "env PYTHONPATH=. python ./scripts/evaluate.py ${vars.dataset} '${vars.run}' ${vars.language}"
+      - "env PYTHONPATH=. python ./src/cli_evaluate.py ${vars.dataset} '${vars.run}' ${vars.language} ${vars.gpu_id}"
     deps:
       - "training/${vars.dataset}/${vars.run}/model-best"
-      - "wikid/output/${vars.language}/nlp"
-      - "training/base-nlp/${vars.language}"
+      - "wikid/output/${vars.language}/wiki.annoy"
+      - "training/base-nlp/${vars.language}"
       - "corpora/${vars.dataset}/dev.spacy"
     outputs:
       - "evaluation/${vars.dataset}"

   - name: compare_evaluations
     help: "Compare available set of evaluation runs."
     script:
-      - "env PYTHONPATH=. python ./scripts/compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}"
+      - "env PYTHONPATH=. python ./src/cli_compare_evaluations.py ${vars.dataset} ${vars.language} --highlight-criterion ${vars.eval_highlight_metric}"
     deps:
       - "evaluation/${vars.dataset}"
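The new retrieve_mentions_candidates step caches KB candidates for every mention ahead of time, so training and evaluation do not have to query the database per example. A hedged sketch of the idea with a stubbed-out KB query (the actual script is src/cli_retrieve_mentions_candidates.py, which is not shown in this diff):

# Sketch of precomputing a mention -> candidate-QIDs cache, as the optional
# retrieve_mentions_candidates step does. look_up_candidates() is a stand-in
# for the real WikiKB query (SQLite alias lookup plus Annoy similarity
# search); everything here is illustrative, not the PR's actual code.
import pickle
from pathlib import Path
from typing import Dict, Iterable, List


def look_up_candidates(mention: str) -> List[str]:
    """Hypothetical stand-in for the WikiKB candidate query."""
    return []


def cache_mention_candidates(mentions: Iterable[str], out_path: Path) -> Dict[str, List[str]]:
    # Deduplicate mentions so each alias hits the KB only once, then persist
    # the lookup table for the train and evaluate steps to reuse.
    cache = {mention: look_up_candidates(mention) for mention in set(mentions)}
    out_path.write_bytes(pickle.dumps(cache))
    return cache


if __name__ == "__main__":
    cache_mention_candidates(["Douglas Adams", "London"], Path("mention_candidates.pkl"))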
7 changes: 3 additions & 4 deletions benchmarks/nel/requirements.txt
@@ -1,7 +1,6 @@
+spacy @ git+https://github.com/rmitsch/spaCy.git@feature/candidate-generation-by-docs
 pyyaml
 tqdm
 prettytable
-scikit-learn
-fuzzyset2
-spacyfishing
-virtualenv
+virtualenv
+spacyfishing
70 changes: 0 additions & 70 deletions benchmarks/nel/scripts/candidate_generation/base.py

This file was deleted.

53 changes: 0 additions & 53 deletions benchmarks/nel/scripts/candidate_generation/embeddings.py

This file was deleted.
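The deleted scripts/candidate_generation/ modules implemented the benchmark's own candidate generators (including an embedding-based one); with WikiKB this responsibility presumably moves into the KB itself, backed by the wiki.annoy index that wikid_create_kb now produces. For orientation, a hedged sketch of embedding-based candidate retrieval with Annoy; the index layout, vector dimensionality and the mapping from Annoy item ids to entity QIDs are assumptions, not WikiKB's actual implementation:

# Illustrative only: approximate-nearest-neighbour candidate retrieval over an
# Annoy index, in the spirit of the wiki.annoy artifact. Paths, metric and
# index layout are assumptions.
from annoy import AnnoyIndex

import spacy

nlp = spacy.load("en_core_web_lg")  # vectors model used by the project
dim = nlp.vocab.vectors.shape[1]    # vector width, e.g. 300

index = AnnoyIndex(dim, "angular")
index.load("wikid/output/en/wiki.annoy")  # artifact path taken from project.yml


def nearest_entities(mention: str, n: int = 10):
    # Embed the mention text and return the ids (and distances) of the n most
    # similar entity vectors stored in the index.
    return index.get_nns_by_vector(nlp(mention).vector, n, include_distances=True)


print(nearest_entities("Douglas Adams"))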
