feat(python/sdk): add support to list transcripts (#4824)

ploeber · ploeber · commit bb65815724a1 · 2024-07-19T16:54:05.000+02:00
GitOrigin-RevId: c39fd04618bf1e9548776d73d84f065f7f286a18
diff --git a/README.md b/README.md
@@ -192,6 +192,74 @@ print(transcript.text)
 
 </details>
 
+<details>
+  <summary>Upload a file</summary>
+
+```python
+import assemblyai as aai
+
+transcriber = aai.Transcriber()
+upload_url = transcriber.upload_file(data)
+```
+
+</details>
+
+<details>
+  <summary>Delete a transcript</summary>
+
+```python
+import assemblyai as aai
+
+transcript = aai.Transcriber().transcribe(audio_url)
+
+aai.Transcript.delete_by_id(transcript.id)
+```
+
+</details>
+
+<details>
+  <summary>List transcripts</summary>
+
+This returns a page of transcripts you created.
+
+```python
+import assemblyai as aai
+
+transcriber = aai.Transcriber()
+
+page = transcriber.list_transcripts()
+print(page.page_details)  # Page details
+print(page.transcripts)  # List of transcripts
+```
+
+You can apply filter parameters:
+
+```python
+params = aai.ListTranscriptParameters(
+    limit=3,
+    status=aai.TranscriptStatus.completed,
+)
+page = transcriber.list_transcripts(params)
+```
+
+You can also paginate over all pages by using the helper property `before_id_of_prev_url`.
+
+The `prev_url` always points to a page with older transcripts. If you pass the `before_id` from
+the `prev_url` query parameters to the next request, you can paginate over all pages from newest to oldest.
+
+```python
+transcriber = aai.Transcriber()
+
+params = aai.ListTranscriptParameters()
+
+page = transcriber.list_transcripts(params)
+while page.page_details.before_id_of_prev_url is not None:
+    params.before_id = page.page_details.before_id_of_prev_url
+    page = transcriber.list_transcripts(params)
+```
+
+</details>
+
 ---
 
 ### **LeMUR Examples**
diff --git a/assemblyai/__init__.py b/assemblyai/__init__.py
@@ -34,6 +34,9 @@
     LemurTaskResponse,
     LemurTranscriptSource,
     LemurUsage,
+    ListTranscriptParameters,
+    ListTranscriptResponse,
+    PageDetails,
     Paragraph,
     PIIRedactedAudioQuality,
     PIIRedactionPolicy,
@@ -57,6 +60,7 @@
     Timestamp,
     TranscriptError,
     TranscriptionConfig,
+    TranscriptItem,
     TranscriptStatus,
     Utterance,
     UtteranceWord,
@@ -103,6 +107,9 @@
     "LemurSummaryResponse",
     "LemurTaskResponse",
     "LemurUsage",
+    "ListTranscriptParameters",
+    "ListTranscriptResponse",
+    "PageDetails",
     "Sentence",
     "Sentiment",
     "SentimentType",
@@ -117,6 +124,7 @@
     "Transcript",
     "TranscriptError",
     "TranscriptGroup",
+    "TranscriptItem",
     "TranscriptStatus",
     "Utterance",
     "UtteranceWord",
diff --git a/assemblyai/__version__.py b/assemblyai/__version__.py
@@ -1 +1 @@
-__version__ = "0.29.0"
+__version__ = "0.31.0"
diff --git a/assemblyai/api.py b/assemblyai/api.py
@@ -247,6 +247,29 @@ def get_paragraphs(
     return types.ParagraphsResponse.parse_obj(response.json())
 
 
+def list_transcripts(
+    client: httpx.Client,
+    params: Optional[types.ListTranscriptParameters] = None,
+) -> types.ListTranscriptResponse:
+    """
+    Retrieve one page of transcripts via a GET request to `ENDPOINT_TRANSCRIPT`.
+
+    Args:
+        client: The HTTP client used to send the request.
+        params: Optional filters; fields left as `None` are not sent as query parameters.
+
+    Returns: The parsed page of transcripts. Raises `types.AssemblyAIError` on a non-OK status.
+    """
+    query = params.dict(exclude_none=True) if params is not None else None
+    response = client.get(ENDPOINT_TRANSCRIPT, params=query)
+    if response.status_code != httpx.codes.ok:
+        raise types.AssemblyAIError(
+            f"failed to retrieve transcripts: {_get_error_message(response)}"
+        )
+
+    return types.ListTranscriptResponse.parse_obj(response.json())
+
+
 def lemur_question(
     client: httpx.Client,
     request: types.LemurQuestionRequest,
diff --git a/assemblyai/transcriber.py b/assemblyai/transcriber.py
@@ -821,6 +821,12 @@ def transcribe_group(
 
         return transcript_group
 
+    def list_transcripts(
+        self, params: Optional[types.ListTranscriptParameters] = None
+    ) -> types.ListTranscriptResponse:
+        """Fetch one page of transcripts, optionally filtered by `params`."""
+        return api.list_transcripts(client=self._client.http_client, params=params)
+
 
 class Transcriber:
     """
@@ -1036,6 +1042,45 @@ def transcribe_group_async(
             poll=True,
         )
 
+    def list_transcripts(
+        self,
+        params: Optional[types.ListTranscriptParameters] = None,
+    ) -> types.ListTranscriptResponse:
+        """
+        Fetch a single page of previously created transcripts, ordered newest first.
+
+        Args:
+            params: Optional filters and paging options for the listing request.
+
+        Returns: One page holding the transcripts together with its page details.
+
+        To walk through every page, copy `page_details.before_id_of_prev_url` into
+        `ListTranscriptParameters.before_id` before requesting the next page. Example:
+        ```
+        transcriber = aai.Transcriber()
+        params = aai.ListTranscriptParameters()
+        page = transcriber.list_transcripts(params)
+        while page.page_details.before_id_of_prev_url is not None:
+            params.before_id = page.page_details.before_id_of_prev_url
+            page = transcriber.list_transcripts(params)
+        ```
+        """
+        return self._impl.list_transcripts(params=params)
+
+    def list_transcripts_async(
+        self,
+        params: Optional[types.ListTranscriptParameters] = None,
+    ) -> concurrent.futures.Future[types.ListTranscriptResponse]:
+        """
+        Submit a transcript listing request to the executor without waiting for it.
+
+        Args:
+            params: Optional filters and paging options for the listing request.
+
+        Returns: A future resolving to one page of transcripts and its page details.
+        """
+        return self._executor.submit(self._impl.list_transcripts, params=params)
+
 
 class _RealtimeTranscriberImpl:
     def __init__(
diff --git a/assemblyai/types.py b/assemblyai/types.py
@@ -1,6 +1,7 @@
 from datetime import datetime
 from enum import Enum, EnumMeta
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple, Union
+from urllib.parse import parse_qs, urlparse
 from warnings import warn
 
 if TYPE_CHECKING:
@@ -1736,6 +1737,89 @@ def __init__(self, **data: Any):
         super().__init__(**data)
 
 
+class ListTranscriptParameters(BaseModel):
+    """
+    The query parameters when listing transcripts. Every field is optional.
+    """
+
+    after_id: Optional[str] = None
+    "Get transcripts that were created after this transcript ID"
+
+    before_id: Optional[str] = None
+    "Get transcripts that were created before this transcript ID"
+
+    created_on: Optional[str] = None
+    "Get only transcripts created on this date"
+
+    limit: Optional[int] = None
+    "Maximum number of transcripts to retrieve. Default is 10"
+
+    status: Optional[TranscriptStatus] = None
+    "Filter by transcript status"
+
+    throttled_only: Optional[bool] = None
+    "Get only throttled transcripts, overrides the status filter"
+
+    class Config:
+        use_enum_values = True  # Populate the enum value for the query parameters
+
+
+class PageDetails(BaseModel):
+    """
+    Details of the transcript page.
+    """
+
+    current_url: str
+    "The URL used to retrieve the current page of transcripts"
+
+    limit: int
+    "The number of results this page is limited to"
+
+    next_url: Optional[str] = None
+    "The URL to the next page of transcripts. The next URL always points to a page with newer transcripts."
+
+    prev_url: Optional[str] = None
+    "The URL to the previous page of transcripts. The previous URL always points to a page with older transcripts."
+
+    result_count: int
+    "The actual number of results in the page"
+
+    @property
+    def before_id_of_prev_url(self) -> Optional[str]:
+        """
+        The `before_id` contained in the `prev_url` query params, or `None` if there is no `prev_url` or no such
+        param. Can be used as `ListTranscriptParameters.before_id` for the next `list_transcripts()` call.
+        """
+        if not self.prev_url:
+            return None
+        parsed_query_params = parse_qs(urlparse(self.prev_url).query)
+        before_id_list = parsed_query_params.get("before_id")
+        return before_id_list[0] if before_id_list else None
+
+
+class TranscriptItem(BaseModel):  # One entry of `ListTranscriptResponse.transcripts`
+    audio_url: str
+    completed: Optional[str] = None
+    created: str
+    error: Optional[str] = None
+    id: str
+    resource_url: str
+    status: TranscriptStatus
+
+
+class ListTranscriptResponse(BaseModel):
+    """
+    One page of transcripts plus the details describing that page.
+    Transcripts are ordered newest first; the previous URL leads to a page with older transcripts.
+    """
+
+    page_details: PageDetails
+    "Metadata about this page of transcripts"
+
+    transcripts: List[TranscriptItem]
+    "The transcripts on this page, ordered from newest to oldest"
+
+
 class LemurSourceType(str, Enum):
     """
     The source type of the LeMUR request
diff --git a/tests/unit/factories.py b/tests/unit/factories.py
@@ -204,6 +204,43 @@ class Meta:
         model = types.TranscriptRequest
 
 
+class PageDetails(factory.Factory):  # Builds `types.PageDetails` test objects
+    class Meta:
+        model = types.PageDetails
+
+    current_url = factory.Faker("url")
+    limit = 10
+    next_url = None  # no neighbouring pages by default
+    prev_url = None
+    result_count = 2  # same as the number of items the ListTranscriptResponse factory builds
+
+
+class TranscriptItem(factory.Factory):  # Builds `types.TranscriptItem` test objects
+    class Meta:
+        model = types.TranscriptItem
+
+    audio_url = factory.Faker("url")
+    created = factory.Faker("iso8601")
+    id = factory.Faker("uuid4")
+    resource_url = factory.Faker("url")
+    status = aai.TranscriptStatus.completed
+    completed = None  # optional fields are left unset by default
+    error = None
+
+
+class ListTranscriptResponse(factory.Factory):  # Builds a page containing two transcript items
+    class Meta:
+        model = types.ListTranscriptResponse
+
+    page_details = factory.SubFactory(PageDetails)
+    transcripts = factory.List(
+        [
+            factory.SubFactory(TranscriptItem),
+            factory.SubFactory(TranscriptItem),
+        ]
+    )
+
+
 class LemurUsage(factory.Factory):
     class Meta:
         model = types.LemurUsage
diff --git a/tests/unit/test_transcriber.py b/tests/unit/test_transcriber.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.29.0"`
	`1`	`+__version__ = "0.31.0"`