Future-House · jamesbraza · Aug 9, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025
diff --git a/src/paperqa/clients/__init__.py b/src/paperqa/clients/__init__.py
@@ -7,7 +7,7 @@
 
 import aiohttp
 from lmi.utils import gather_with_concurrency
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 from paperqa.types import Doc, DocDetails
 
@@ -36,21 +36,33 @@
 
 
 class DocMetadataTask(BaseModel):
-    """Holder for provider and processor tasks."""
+    """Simple container pairing metadata providers with processors."""
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    providers: Collection[MetadataProvider]
-    processors: Collection[MetadataPostProcessor]
+    providers: Collection[MetadataProvider] = Field(
+        description=(
+            "Metadata providers allotted to this task."
+            " An example would be providers for Crossref and Semantic Scholar."
+        )
+    )
+    processors: Collection[MetadataPostProcessor] = Field(
+        description=(
+            "Metadata post-processors allotted to this task."
+            " An example would be a journal quality filter."
+        )
+    )
 
     def provider_queries(
         self, query: dict
     ) -> list[Coroutine[Any, Any, DocDetails | None]]:
+        """Set up query coroutines for each contained metadata provider."""
         return [p.query(query) for p in self.providers]
 
     def processor_queries(
         self, doc_details: DocDetails, session: aiohttp.ClientSession
     ) -> list[Coroutine[Any, Any, DocDetails]]:
+        """Set up process coroutines for each contained metadata post-processor."""
         return [
             p.process(copy.copy(doc_details), session=session) for p in self.processors
         ]
@@ -78,7 +90,6 @@ def __init__(
                 if nested, will query in order looking for termination criteria after each.
                 Will terminate early if either DocDetails.is_hydration_needed is False OR if
                 all requested fields are present in the DocDetails object.
-
         """
         self._session = session
         self.tasks: list[DocMetadataTask] = []

diff --git a/src/paperqa/clients/client_models.py b/src/paperqa/clients/client_models.py
@@ -88,25 +88,28 @@ class JournalQuery(ClientQuery):
 
 
 class MetadataProvider(ABC, Generic[ClientQueryType]):
-    """Provide metadata from a query by any means necessary."""
+    """Provide metadata from a query by any means necessary.
+
+    An example is going from a DOI to full paper metadata using Semantic Scholar.
+    """
 
     async def query(self, query: dict) -> DocDetails | None:
-        return await self._query(self.query_transformer(query))
+        return await self._query(self.query_factory(query))
 
     @abstractmethod
     async def _query(self, query: ClientQueryType) -> DocDetails | None:
-        pass
+        """Run a query against the provider."""
 
     @abstractmethod
-    def query_transformer(self, query: dict) -> ClientQueryType:
-        pass
+    def query_factory(self, query: dict) -> ClientQueryType:
+        """Create a query object from unstructured query data."""
 
 
 class DOIOrTitleBasedProvider(MetadataProvider[DOIQuery | TitleAuthorQuery]):
 
     async def query(self, query: dict) -> DocDetails | None:
         try:
-            client_query = self.query_transformer(query)
+            client_query = self.query_factory(query)
             return await self._query(client_query)
         # We allow graceful failures, i.e. return "None" for both DOI errors and timeout errors
         # DOINotFoundError means the paper doesn't exist in the source, the timeout is to prevent
@@ -150,7 +153,7 @@ async def _query(self, query: DOIQuery | TitleAuthorQuery) -> DocDetails | None:
             TimeoutError: When the request takes too long on the client side
         """
 
-    def query_transformer(self, query: dict) -> DOIQuery | TitleAuthorQuery:
+    def query_factory(self, query: dict) -> DOIQuery | TitleAuthorQuery:
         try:
             if "doi" in query:
                 return DOIQuery(**query)
@@ -169,7 +172,6 @@ class MetadataPostProcessor(ABC, Generic[ClientQueryType]):
 
     MetadataPostProcessor should be idempotent and not order-dependent, i.e.
     all MetadataPostProcessor instances should be able to run in parallel.
-
     """
 
     async def process(self, doc_details: DocDetails, **kwargs) -> DocDetails: