Add RawTask executing docs verbatim (#395)

* Add context length info. Refactor BuiltinTask and models to facilitate this. * Add token count estimator plumbing. * Add plumbing for mapper and reducer. * Add ShardMapper prototype. * Integrating mapping into prompt generation workflow. * Update response parsing and component to support sharding (WIP). * Fix shard & prompt flow. * Fix shard & prompt flow. * Remove todo comments. * Fix Anthropic, Cohere, NoOp model tests. * Fix test_llm_pipe(). * Fix type checking test. * Fix span parsing tests. * Fix internal tests. * Fix _CountTask. * Fix sentiment and summarization tasks and tests. * Fix Azure connection URL. Fix Model test pings. * Fix Lemma parsing. * Start work on doc-to-shard property copying. * Fix REL doc preprocessing. * Remove comment on doc attribute handling during sharding, as this is done by spaCy's slicing directly. * Add reducer implementations. * Implement outstanding task reducers. * Add shardable/non-shardable LLM task typing distinction. Add support for handling both types of tasks. Update tests. * Fix EL task. * Fix EL tokenization and highlighting partially. * Fix tokenization and whitespaces for EL task. * Add new registry handlers (with context length and arbitrary model names) for all REST models. * Add sharding test with simple count task. * Fix sharding algorithm. * Add test with simple count task. * Add context length as init arg in HF models. * Fix tests. Don't stringify IO lists if sharded. * Fix tests. * Add NER sharding test. * Add REL and sentiment sharding tests. * Add summary sharding tests. * Add EL sharding task. Fix bug in shard mapper. * Fix REL error with RELExample parsing. * Use regex for punctuation in REL conversion. * Maintain custom doc attributes, incl. test. * Filter merge warnings in textcat reduction. * Fix custom doc data merging. * Add RawTask. * Fix task version. * Add sharding test. 
* Update spacy_llm/models/langchain/model.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy_llm/pipeline/llm.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Incorporate feedback. * Move sharding compatibility warning to component constructor. * Update spacy_llm/tasks/entity_linker/util.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy_llm/models/hf/base.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Incorporate feedback. * Update spacy_llm/tasks/raw/registry.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fix tests. * Remove boilerplate text in raw template. * Fix sharding test. --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
explosion · Dec 11, 2023 · 7c533cd · 7c533cd
1 parent a6515bf
commit 7c533cd
Show file tree

Hide file tree

Showing 16 changed files with 557 additions and 3 deletions.
diff --git a/spacy_llm/cache.py b/spacy_llm/cache.py
@@ -72,7 +72,7 @@ def initialize(self, vocab: Vocab, task: LLMTask) -> None:
         """
         Initialize cache with data not available at construction time.
         vocab (Vocab): Vocab object.
-        task (LLMTask): Task.
+        task (ShardingLLMTask): Task.
         """
         self._vocab = vocab
         if isinstance(task, PromptTemplateProvider):

diff --git a/spacy_llm/tasks/__init__.py b/spacy_llm/tasks/__init__.py
@@ -7,6 +7,7 @@
 from .lemma import LemmaTask, make_lemma_task
 from .ner import NERTask, make_ner_task_v3
 from .noop import NoopTask, ShardingNoopTask, make_noop_task, make_noopnoshards_task
+from .raw import RawTask, make_raw_task
 from .rel import RELTask, make_rel_task
 from .sentiment import SentimentTask, make_sentiment_task
 from .spancat import SpanCatTask, make_spancat_task_v3
@@ -16,6 +17,7 @@
 _LATEST_TASKS = (
     "spacy.EntityLinker.v1",
     "spacy.NER.v3",
+    "spacy.Raw.v1",
     "spacy.REL.v1",
     "spacy.Sentiment.v1",
     "spacy.SpanCat.v3",
@@ -43,6 +45,7 @@
     "make_ner_task_v3",
     "make_noop_task",
     "make_noopnoshards_task",
+    "make_raw_task",
     "make_rel_task",
     "make_sentiment_task",
     "make_spancat_task_v3",
@@ -53,6 +56,7 @@
     "LemmaTask",
     "NERTask",
     "NoopTask",
+    "RawTask",
     "RELTask",
     "SentimentTask",
     "ShardingNoopTask",

diff --git a/spacy_llm/tasks/lemma/registry.py b/spacy_llm/tasks/lemma/registry.py
@@ -41,7 +41,6 @@ def make_lemma_task(
     prompt_example_type (Optional[Type[FewshotExample]]): Type to use for fewshot examples.
     examples (ExamplesConfigType): Optional callable that reads a file containing task examples for
         few-shot learning. If None is passed, then zero-shot learning will be used.
-    n_token_estimator (Optional[NTokenEstimator]): Estimates number of tokens in a string.
     shard_mapper (Optional[ShardMapper]): Maps docs to shards if they don't fit into the model context.
     shard_reducer (Optional[ShardReducer]): Reduces doc shards back into one doc instance.
     scorer (Optional[Scorer]): Scorer function.

diff --git a/spacy_llm/tasks/lemma/util.py b/spacy_llm/tasks/lemma/util.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Any, Dict, Iterable, List, Optional
 
 from spacy.scorer import Scorer
@@ -34,4 +35,10 @@ def reduce_shards_to_doc(task: LemmaTask, shards: Iterable[Doc]) -> Doc:
     RETURNS (Doc): Fused doc instance.
     """
     # Lemmas are token-specific, so we can just merge shards.
-    return Doc.from_docs(list(shards), ensure_whitespace=True)
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            category=UserWarning,
+            message=".*Skipping .* while merging docs.",
+        )
+        return Doc.from_docs(list(shards), ensure_whitespace=True)
diff --git a/spacy_llm/tasks/raw/__init__.py b/spacy_llm/tasks/raw/__init__.py
@@ -0,0 +1,5 @@
+from .registry import make_raw_task
+from .task import RawTask
+from .util import RawExample
+
+__all__ = ["make_raw_task", "RawExample", "RawTask"]
diff --git a/spacy_llm/tasks/raw/parser.py b/spacy_llm/tasks/raw/parser.py
@@ -0,0 +1,19 @@
+from typing import Iterable, List
+
+from spacy.tokens import Doc
+
+from .task import RawTask
+
+
+def parse_responses_v1(
+    task: RawTask, shards: Iterable[Iterable[Doc]], responses: Iterable[Iterable[str]]
+) -> Iterable[List[str]]:
+    """Passes LLM responses for spacy.Raw.v1 through unchanged. The expected shape of a raw reply is unknown, so
+        nothing is parsed - the responses of each doc are only collected into a list.
+    task (RawTask): Task instance (not used).
+    shards (Iterable[Iterable[Doc]]): Doc shards (not used).
+    responses (Iterable[Iterable[str]]): LLM responses, grouped per doc.
+    RETURNS (Iterable[List[str]]): One list per doc, holding one reply string per shard.
+    """
+    yield from (list(replies_for_doc) for replies_for_doc in responses)
diff --git a/spacy_llm/tasks/raw/registry.py b/spacy_llm/tasks/raw/registry.py
@@ -0,0 +1,55 @@
+from typing import Optional, Type
+
+from ...registry import registry
+from ...ty import ExamplesConfigType, FewshotExample, ShardMapper, ShardReducer
+from ...ty import TaskResponseParser
+from ..util.sharding import make_shard_mapper
+from .parser import parse_responses_v1
+from .task import DEFAULT_RAW_TEMPLATE_V1, RawTask
+from .util import RawExample, reduce_shards_to_doc
+
+
+@registry.llm_misc("spacy.RawParser.v1")
+def make_raw_parser() -> TaskResponseParser[RawTask]:
+    """Provides the response parser registered as spacy.RawParser.v1.
+    RETURNS (TaskResponseParser[RawTask]): The parse_responses_v1 callable.
+    """
+    parser: TaskResponseParser[RawTask] = parse_responses_v1
+    return parser
+
+
+@registry.llm_misc("spacy.RawShardReducer.v1")
+def make_shard_reducer() -> ShardReducer:
+    """Provides the shard reducer registered as spacy.RawShardReducer.v1.
+    RETURNS (ShardReducer): The reduce_shards_to_doc callable.
+    """
+    reducer: ShardReducer = reduce_shards_to_doc
+    return reducer
+
+
+@registry.llm_tasks("spacy.Raw.v1")
+def make_raw_task(
+    template: str = DEFAULT_RAW_TEMPLATE_V1,
+    field: str = "llm_reply",
+    parse_responses: Optional[TaskResponseParser[RawTask]] = None,
+    prompt_example_type: Optional[Type[FewshotExample]] = None,
+    examples: ExamplesConfigType = None,
+    shard_mapper: Optional[ShardMapper] = None,
+    shard_reducer: Optional[ShardReducer] = None,
+):
+    """Factory for the spacy.Raw.v1 task. Every optional argument left unset falls back to the Raw.v1 default.
+
+    template (str): Prompt template passed to the model.
+    field (str): Name of the doc extension field that replies are stored in.
+    parse_responses (Optional[TaskResponseParser]): Callable for parsing LLM responses for this task. Falls back to
+        parse_responses_v1.
+    prompt_example_type (Optional[Type[FewshotExample]]): Type to use for fewshot examples. Falls back to RawExample.
+    examples (ExamplesConfigType): Few-shot examples, or a callable returning them. If nothing (or nothing truthy)
+        is provided, zero-shot learning will be used.
+    shard_mapper (Optional[ShardMapper]): Maps docs to shards if they don't fit into the model context.
+    shard_reducer (Optional[ShardReducer]): Reduces doc shards back into one doc instance.
+    """
+    example_type = prompt_example_type or RawExample
+    example_data = examples() if callable(examples) else examples
+
+    # No (or empty) example data means there are no few-shot examples to instantiate.
+    fewshot_examples = None
+    if example_data:
+        fewshot_examples = [example_type(**eg) for eg in example_data]
+
+    response_parser = parse_responses or parse_responses_v1
+    mapper = shard_mapper or make_shard_mapper()
+    reducer = shard_reducer or make_shard_reducer()
+
+    return RawTask(
+        template=template,
+        field=field,
+        parse_responses=response_parser,
+        prompt_example_type=example_type,
+        prompt_examples=fewshot_examples,
+        shard_mapper=mapper,
+        shard_reducer=reducer,
+    )
diff --git a/spacy_llm/tasks/raw/task.py b/spacy_llm/tasks/raw/task.py
@@ -0,0 +1,86 @@
+from typing import Callable, Iterable, List, Optional, Type
+
+from spacy import Language
+from spacy.tokens import Doc
+from spacy.training import Example
+
+from ...compat import Self
+from ...ty import FewshotExample, ShardMapper, ShardReducer, TaskResponseParser
+from ..builtin_task import BuiltinTask
+from ..templates import read_template
+
+DEFAULT_RAW_TEMPLATE_V1 = read_template("raw.v1")
+
+
+class RawTask(BuiltinTask):
+    """Task that stores the LLM's reply per doc in a custom doc extension field, as returned by the response
+    parser."""
+
+    def __init__(
+        self,
+        parse_responses: TaskResponseParser[Self],
+        prompt_example_type: Type[FewshotExample[Self]],
+        prompt_examples: Optional[List[FewshotExample[Self]]],
+        template: str,
+        field: str,
+        shard_mapper: ShardMapper,
+        shard_reducer: ShardReducer[Self],
+    ):
+        """Raw task. Expects prompt template without instructions for LLM, i.e. docs have to provide instructions
+            themselves.
+
+        parse_responses (TaskResponseParser[Self]): Callable for parsing LLM responses for this task.
+        prompt_example_type (Type[FewshotExample[Self]]): Type to use for fewshot examples.
+        prompt_examples (Optional[List[FewshotExample[Self]]]): Optional list of few-shot examples to include in prompts.
+        template (str): Prompt template passed to the model.
+        field (str): Field to store responses in.
+        shard_mapper (ShardMapper): Maps docs to shards if they don't fit into the model context.
+        shard_reducer (ShardReducer[Self]): Reduces doc shards back into one doc instance.
+        """
+        super().__init__(
+            parse_responses=parse_responses,
+            prompt_example_type=prompt_example_type,
+            template=template,
+            prompt_examples=prompt_examples,
+            shard_mapper=shard_mapper,
+            shard_reducer=shard_reducer,
+        )
+        self._field = field
+        # Register the doc extension up front so that replies can be stored in it later.
+        self._check_doc_extension()
+
+    def parse_responses(
+        self, shards: Iterable[Iterable[Doc]], responses: Iterable[Iterable[str]]
+    ) -> Iterable[Doc]:
+        """Stores the parsed response of each shard in the shard's extension field and reduces the shards of each
+            doc to a single doc.
+        shards (Iterable[Iterable[Doc]]): Doc shards, grouped per doc.
+        responses (Iterable[Iterable[str]]): LLM responses, grouped per doc.
+        RETURNS (Iterable[Doc]): One reduced doc per group of shards.
+        """
+        # The shards are needed twice: here and in the response parser.
+        # NOTE(review): _tee_2d_iterable is defined outside this file - presumably it duplicates the nested
+        # iterable so that both consumers can iterate it independently. Confirm against BuiltinTask.
+        shards_teed = self._tee_2d_iterable(shards, 2)
+        for shards_for_doc, responses_for_doc in zip(
+            shards_teed[0], self._parse_responses(self, shards_teed[1], responses)
+        ):
+            updated_shards_for_doc: List[Doc] = []
+            # zip() stops at the shorter input: shards without a matching response are dropped from the reduction.
+            for shard, response in zip(shards_for_doc, responses_for_doc):
+                setattr(shard._, self._field, response)
+                updated_shards_for_doc.append(shard)
+
+            yield self._shard_reducer(self, updated_shards_for_doc)  # type: ignore[arg-type]
+
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable["Example"]],
+        nlp: Language,
+        n_prompt_examples: int = 0,
+    ) -> None:
+        """Initializes the task by forwarding all arguments to the parent's _initialize().
+        get_examples (Callable[[], Iterable["Example"]]): Callable that provides examples.
+        nlp (Language): Language instance.
+        n_prompt_examples (int): Forwarded as is; presumably the number of examples to use as few-shot prompt
+            examples - confirm against BuiltinTask._initialize().
+        """
+        super()._initialize(
+            get_examples=get_examples, nlp=nlp, n_prompt_examples=n_prompt_examples
+        )
+
+    def _check_doc_extension(self) -> None:
+        """Add extension if need be."""
+        # Only register the extension if it doesn't exist yet; its default value is None.
+        if not Doc.has_extension(self._field):
+            Doc.set_extension(self._field, default=None)
+
+    @property
+    def _cfg_keys(self) -> List[str]:
+        """Return the attribute names listed as config keys for this task.
+        NOTE(review): how these keys are consumed is defined outside this file - presumably for task
+        (de-)serialization. Confirm against BuiltinTask.
+        RETURNS (List[str]): Config key names.
+        """
+        return ["_template"]
+
+    @property
+    def field(self) -> str:
+        """Return field used to store replies in docs.
+        RETURNS (str): Field used to store replies in docs.
+        """
+        return self._field
diff --git a/spacy_llm/tasks/raw/util.py b/spacy_llm/tasks/raw/util.py
@@ -0,0 +1,44 @@
+import warnings
+from typing import Iterable, Optional
+
+from spacy.tokens import Doc
+from spacy.training import Example
+
+from ...compat import Self
+from ...ty import FewshotExample
+from .task import RawTask
+
+
+class RawExample(FewshotExample[RawTask]):
+    """Few-shot example for RawTask: a text together with the reply to it."""
+
+    text: str
+    reply: str
+
+    @classmethod
+    def generate(cls, example: Example, task: RawTask) -> Optional[Self]:
+        """Creates a few-shot example from the reference doc of a training example.
+        example (Example): Training example. Text and reply are read from its reference doc.
+        task (RawTask): Task instance. Its field determines which doc extension holds the reply.
+        RETURNS (Optional[Self]): Generated few-shot example.
+        """
+        reference = example.reference
+        return cls(text=reference.text, reply=getattr(reference._, task.field))
+
+
+def reduce_shards_to_doc(task: RawTask, shards: Iterable[Doc]) -> Doc:
+    """Reduces shards to docs for RawTask. The shards are merged into a single doc, and the replies stored on the
+        individual shards are joined with single spaces and stored on the merged doc. Shards whose reply field is
+        None are skipped when joining, as str.join() would raise a TypeError for them.
+    task (RawTask): Task.
+    shards (Iterable[Doc]): Shards to reduce to single doc instance.
+    RETURNS (Doc): Fused doc instance.
+    """
+    # Materialize once, as the shards are iterated twice (merging docs, collecting replies).
+    shard_list = list(shards)
+
+    with warnings.catch_warnings():
+        # Merging docs may warn about skipped data; this is expected here and hence silenced.
+        warnings.filterwarnings(
+            "ignore",
+            category=UserWarning,
+            message=".*Skipping .* while merging docs.",
+        )
+        doc = Doc.from_docs(shard_list, ensure_whitespace=True)
+        replies = (getattr(shard._, task.field) for shard in shard_list)
+        setattr(
+            doc._,
+            task.field,
+            " ".join([reply for reply in replies if reply is not None]),
+        )
+
+    return doc
diff --git a/spacy_llm/tasks/templates/raw.v1.jinja b/spacy_llm/tasks/templates/raw.v1.jinja
@@ -0,0 +1,17 @@
+{%- if prompt_examples -%}
+Below are some examples (only use these as a guide):
+{# whitespace #}
+{%- for example in prompt_examples -%}
+{# whitespace #}
+Text:
+{{ example.text }}
+Reply:
+{{ example.reply }}
+{# whitespace #}
+{%- endfor -%}
+{# whitespace #}
+{%- endif -%}
+{# whitespace #}
+Text:
+{{ text }}
+Reply:
diff --git a/spacy_llm/tests/sharding/test_sharding.py b/spacy_llm/tests/sharding/test_sharding.py
@@ -282,3 +282,25 @@ def test_sharding_entity_linker(config):
     assert all([ent.kb_id_ != EntityLinker.NIL for ent in doc.ents])
     assert prompts == ["Alice goes to *Boston* to ", "see the *Boston Celtics* game."]
     assert len(doc.user_data["llm_io"]["llm"]["response"]) == 2
+
+
+@pytest.mark.external
+@pytest.mark.skipif(has_openai_key is False, reason="OpenAI API key not available")
+def test_sharding_raw(config):
+    """Tests that spacy.Raw.v1 splits _TEXT into two shards for a context length of 20 and stores a reply."""
+    config["components"]["llm"]["model"]["context_length"] = 20
+    config["components"]["llm"]["task"] = {"@llm_tasks": "spacy.Raw.v1"}
+    nlp = assemble_from_config(config)
+
+    doc = nlp(_TEXT)
+    llm_io = doc.user_data["llm_io"]["llm"]
+    start_marker = "Text:\n"
+    end_marker = "\nReply:"
+    # Recover the shard text from each prompt: whatever sits between the last "Text:" and the last "Reply:" marker.
+    prompts = []
+    for prompt in llm_io["prompt"]:
+        start = prompt.rindex(start_marker) + len(start_marker)
+        prompts.append(prompt[start : prompt.rindex(end_marker)])
+
+    assert hasattr(doc._, "llm_reply") and doc._.llm_reply
+    assert prompts == [
+        "Do one thing every day that scares you. The ",
+        "only thing we have to fear is fear itself.",
+    ]
+    assert len(llm_io["response"]) == 2
diff --git a/spacy_llm/tests/tasks/examples/raw.json b/spacy_llm/tests/tasks/examples/raw.json
@@ -0,0 +1,5 @@
+[
+  {"text": "3 + 5 = x. What's x?", "reply": "8"},
+  {"text": "Write me a limerick.", "reply": "There was an Old Man with a beard, Who said, 'It is just as I feared! Two Owls and a Hen, Four Larks and a Wren, Have all built their nests in my beard!"},
+  {"text": "Analyse the sentiment of the text 'This is great'.", "reply": "'This is great' expresses a very positive sentiment."}
+]
diff --git a/spacy_llm/tests/tasks/examples/raw.jsonl b/spacy_llm/tests/tasks/examples/raw.jsonl
@@ -0,0 +1,3 @@
+{"text": "3 + 5 = x. What's x?", "reply": "8"}
+{"text": "Write me a limerick.", "reply": "There was an Old Man with a beard, Who said, 'It is just as I feared! Two Owls and a Hen, Four Larks and a Wren, Have all built their nests in my beard!"}
+{"text": "Analyse the sentiment of the text 'This is great'.", "reply": "'This is great' expresses a very positive sentiment."}
diff --git a/spacy_llm/tests/tasks/examples/raw.yml b/spacy_llm/tests/tasks/examples/raw.yml
@@ -0,0 +1,8 @@
+- text: "3 + 5 = x. What's x?"
+  reply: "8"
+
+- text: "Write me a limerick."
+  reply: "There was an Old Man with a beard, Who said, 'It is just as I feared! Two Owls and a Hen, Four Larks and a Wren, Have all built their nests in my beard!"
+
+- text: "Analyse the sentiment of the text 'This is great'."
+  reply: "'This is great' expresses a very positive sentiment."
diff --git a/spacy_llm/tests/tasks/templates/raw.jinja2 b/spacy_llm/tests/tasks/templates/raw.jinja2
@@ -0,0 +1,2 @@
+This is a test RAW template.
+Here is the text: {{ text }}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		This is a test RAW template.
		Here is the text: {{ text }}