Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
b064066
Add experimental rocm support
Jan 18, 2025
1d01755
Fix merge conflicts
kprinssu Mar 20, 2025
aba2628
Update Dockerfile to build against newer code
kprinssu Mar 22, 2025
a665cfc
Merge branch 'remsky:master' into master
kprinssu Mar 24, 2025
4f149ab
Use my fork of Kokoro
kprinssu Mar 25, 2025
3db522b
Merge branch 'master' of github.com:kprinssu/Kokoro-FastAPI
kprinssu Mar 25, 2025
2185b1b
Merge branch 'remsky:master' into master
kprinssu Mar 27, 2025
e9d9921
Merge branch 'remsky:master' into master
kprinssu Mar 30, 2025
19ac239
Add notes about performance inside docker compose
kprinssu Mar 31, 2025
9e595d9
Revert to upstream Kokoro
kprinssu Mar 31, 2025
94e5305
Remove casual_conv1d
kprinssu Apr 1, 2025
b19fd1d
Fix merge conflicts
kprinssu Apr 5, 2025
105b96e
Update to ROCm 6.4 and use uv to install dependencies
kprinssu Apr 14, 2025
e464b3a
Merge branch 'remsky:release' into release
kprinssu Apr 14, 2025
cec37ba
Remove uv.lock
kprinssu Apr 14, 2025
5696c5d
Use default versioning for numpy
kprinssu Apr 14, 2025
f248658
Merge branch 'release' of github.com:kprinssu/Kokoro-FastAPI into rel…
kprinssu Apr 14, 2025
c5086c7
Merge branch 'remsky:master' into master
kprinssu Apr 28, 2025
c1214b7
Merge remote-tracking branch 'origin/master' into release
kprinssu Apr 30, 2025
32191c6
Use ROCm 6.4 and smaller base Docker image
kprinssu Apr 30, 2025
9e330c6
Update training notes
kprinssu May 2, 2025
800bd2f
Merge branch 'remsky:master' into master
kprinssu May 21, 2025
6b1e9d9
Merge pull request #339 from fireblade2534/master
fireblade2534 Jun 18, 2025
0241423
Merge branch 'remsky:master' into master
kprinssu Jun 21, 2025
20e6281
Merge branch 'remsky:release' into release
kprinssu Jun 21, 2025
986cf04
Update to ROCm 6.4.1
kprinssu Jun 21, 2025
2a6d2ae
Fix custom phenomes and make them more robust
fireblade2534 Jun 26, 2025
cab2901
Merge branch 'remsky:master' into master
fireblade2534 Jun 26, 2025
8a55cd5
Update torch to 2.7.1 & Cuda 12.8.1 in Docker
MiggiV2 Jun 27, 2025
f8c8916
Merge pull request #350 from fireblade2534/master
fireblade2534 Jun 27, 2025
6805c8f
Merge branch 'remsky:master' into master
kprinssu Jun 29, 2025
a109a33
Merge remote-tracking branch 'origin/master' into release
kprinssu Jun 29, 2025
ce3cd62
Add ROCm Dockerfile to docker-bake.hcl
kprinssu Jun 29, 2025
5622fd4
Clean up more disk space
kprinssu Jun 29, 2025
105cc67
Disable ROCm arm64 builds
kprinssu Jun 29, 2025
379e858
Split up builds into a matrix
kprinssu Jun 29, 2025
35fd504
Fix typo
kprinssu Jun 29, 2025
ad65800
Make myself the repo owner
kprinssu Jun 29, 2025
fae42d3
Enable builds for master
kprinssu Jun 29, 2025
2ad4467
Revert "Make myself the repo owner"
kprinssu Jun 29, 2025
1d6d455
Minor revert to clean up disk space step
kprinssu Jun 29, 2025
130e6d9
Increase build step timeout
kprinssu Jun 29, 2025
726803a
Split arm64 and x86_64 builds
kprinssu Jun 29, 2025
1aaf004
Update pipelines to also build arm64
kprinssu Jun 29, 2025
92505e3
Fix arm64 cpu build target
kprinssu Jun 29, 2025
70b9521
Revert changes to pyproject toml to use pytorch nightly
kprinssu Jun 30, 2025
7e9108e
Revert changes to pyproject toml to use pytorch nightly
kprinssu Jun 30, 2025
67ac35f
Split up arm64 builds into their own targets
kprinssu Jun 30, 2025
812a72d
Bump version to 0.2.4-1
kprinssu Jun 30, 2025
84f5cc4
Reduce bloat in ROCm Dockerfile
kprinssu Jun 30, 2025
29066f7
Merge pull request #354 from MiggiV2/master
fireblade2534 Jul 4, 2025
acc671f
Smaller docker image:
faltiska Jul 12, 2025
bbb6ab6
Merge pull request #361 from faltiska/smaller-docker-image
fireblade2534 Jul 29, 2025
ef20f78
Merge branch 'remsky:master' into master
kprinssu Aug 10, 2025
5528a5a
Update Dockerfile to ROCm 6.4.3 and use newly released Pytorch 2.8
kprinssu Aug 10, 2025
dc2601b
Install missing rocrand package
kprinssu Aug 11, 2025
7e6d339
Fix merge conflicts
kprinssu Aug 11, 2025
b3a3af6
Clean up pyproject with duplicate indices
kprinssu Aug 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ jobs:
build-images:
needs: prepare-release
runs-on: ubuntu-latest
timeout-minutes: 60
permissions:
packages: write # Needed to push images to GHCR
env:
DOCKER_BUILDKIT: 1
BUILDKIT_STEP_LOG_MAX_SIZE: 10485760
# This environment variable will override the VERSION variable in docker-bake.hcl
VERSION: ${{ needs.prepare-release.outputs.version_tag }} # Use tag version (vX.Y.Z) for bake
strategy:
matrix:
build_target: ["cpu", "cpu-arm64", "gpu-arm64", "gpu", "rocm"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -60,7 +64,7 @@ jobs:
df -h
echo "Cleaning up disk space..."
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache
docker system prune -af
sudo docker system prune -af
echo "Disk space after cleanup"
df -h

Expand All @@ -85,7 +89,7 @@ jobs:
run: |
echo "Building and pushing images for version ${{ needs.prepare-release.outputs.version_tag }}"
# The VERSION env var above sets the tag for the bake file targets
docker buildx bake --push
docker buildx bake ${{ matrix.build_target }} --push

create-release:
needs: [prepare-release, build-images]
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.4
0.2.4-2
2 changes: 1 addition & 1 deletion api/src/services/text_processing/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,4 +497,4 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st

text = re.sub(r"\s{2,}", " ", text)

return text.strip()
return text
27 changes: 9 additions & 18 deletions api/src/services/text_processing/text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# Pre-compiled regex patterns for performance
# Updated regex to be more strict and avoid matching isolated brackets
# Only matches complete patterns like [word](/ipa/) and prevents catastrophic backtracking
CUSTOM_PHONEMES = re.compile(r"(\[[^\[\]]*?\])(\(\/[^\/\(\)]*?\/\))")
CUSTOM_PHONEMES = re.compile(r"(\[[^\[\]]*?\]\(\/[^\/\(\)]*?\/\))")
# Pattern to find pause tags like [pause:0.5s]
PAUSE_TAG_PATTERN = re.compile(r"\[pause:(\d+(?:\.\d+)?)s\]", re.IGNORECASE)

Expand Down Expand Up @@ -100,7 +100,7 @@ def process_text(text: str, language: str = "a") -> List[int]:


def get_sentence_info(
text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a"
text: str, lang_code: str = "a"
) -> List[Tuple[str, List[int], int]]:
"""Process all sentences and return info"""
# Detect Chinese text
Expand All @@ -110,18 +110,10 @@ def get_sentence_info(
sentences = re.split(r"([,。!?;])+", text)
else:
sentences = re.split(r"([.!?;:])(?=\s|$)", text)
phoneme_length, min_value = len(custom_phenomes_list), 0

results = []
for i in range(0, len(sentences), 2):
sentence = sentences[i].strip()
for replaced in range(min_value, phoneme_length):
current_id = f"</|custom_phonemes_{replaced}|/>"
if current_id in sentence:
sentence = sentence.replace(
current_id, custom_phenomes_list.pop(current_id)
)
min_value += 1
punct = sentences[i + 1] if i + 1 < len(sentences) else ""
if not sentence:
continue
Expand Down Expand Up @@ -173,24 +165,23 @@ async def smart_split(
# Strip leading and trailing spaces to prevent pause tag splitting artifacts
text_part_raw = text_part_raw.strip()

# Apply the original smart_split logic to this text part
custom_phoneme_list = {}

# Normalize text (original logic)
processed_text = text_part_raw
if settings.advanced_text_normalization and normalization_options.normalize:
if lang_code in ["a", "b", "en-us", "en-gb"]:
processed_text = CUSTOM_PHONEMES.sub(
lambda s: handle_custom_phonemes(s, custom_phoneme_list), processed_text
)
processed_text = normalize_text(processed_text, normalization_options)
processed_text = CUSTOM_PHONEMES.split(processed_text)
for index in range(0, len(processed_text), 2):
processed_text[index] = normalize_text(processed_text[index], normalization_options)


processed_text = "".join(processed_text).strip()
else:
logger.info(
"Skipping text normalization as it is only supported for english"
)

# Process all sentences (original logic)
sentences = get_sentence_info(processed_text, custom_phoneme_list, lang_code=lang_code)
sentences = get_sentence_info(processed_text, lang_code=lang_code)

current_chunk = []
current_tokens = []
Expand Down
49 changes: 29 additions & 20 deletions api/tests/test_text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_process_text_chunk_phonemes():
def test_get_sentence_info():
"""Test sentence splitting and info extraction."""
text = "This is sentence one. This is sentence two! What about three?"
results = get_sentence_info(text, {})
results = get_sentence_info(text)

assert len(results) == 3
for sentence, tokens, count in results:
Expand All @@ -44,24 +44,6 @@ def test_get_sentence_info():
assert count == len(tokens)
assert count > 0


def test_get_sentence_info_phenomoes():
"""Test sentence splitting and info extraction."""
text = (
"This is sentence one. This is </|custom_phonemes_0|/> two! What about three?"
)
results = get_sentence_info(text, {"</|custom_phonemes_0|/>": r"sˈɛntᵊns"})

assert len(results) == 3
assert "sˈɛntᵊns" in results[1][0]
for sentence, tokens, count in results:
assert isinstance(sentence, str)
assert isinstance(tokens, list)
assert isinstance(count, int)
assert count == len(tokens)
assert count > 0


@pytest.mark.asyncio
async def test_smart_split_short_text():
"""Test smart splitting with text under max tokens."""
Expand All @@ -74,6 +56,33 @@ async def test_smart_split_short_text():
assert isinstance(chunks[0][0], str)
assert isinstance(chunks[0][1], list)

@pytest.mark.asyncio
async def test_smart_custom_phenomes():
    """Test that custom phoneme annotations survive smart splitting intact."""
text = "This is a short test sentence. [Kokoro](/kˈOkəɹO/) has a feature called custom phenomes. This is made possible by [Misaki](/misˈɑki/), the custom phenomizer that [Kokoro](/kˈOkəɹO/) version 1.0 uses"
chunks = []
async for chunk_text, chunk_tokens, pause_duration in smart_split(text):
chunks.append((chunk_text, chunk_tokens, pause_duration))

    # Should have 1 chunk: text
assert len(chunks) == 1

# First chunk: text
assert chunks[0][2] is None # No pause
assert "This is a short test sentence. [Kokoro](/kˈOkəɹO/) has a feature called custom phenomes. This is made possible by [Misaki](/misˈɑki/), the custom phenomizer that [Kokoro](/kˈOkəɹO/) version one uses" in chunks[0][0]
assert len(chunks[0][1]) > 0

@pytest.mark.asyncio
async def test_smart_split_only_phenomes():
    """Test input that is entirely made of phoneme annotations."""
text = "[Kokoro](/kˈOkəɹO/) [Misaki 1.2](/misˈɑki/) [Test](/tɛst/)"
chunks = []
async for chunk_text, chunk_tokens, pause_duration in smart_split(text, max_tokens=10):
chunks.append((chunk_text, chunk_tokens, pause_duration))

assert len(chunks) == 1
assert "[Kokoro](/kˈOkəɹO/) [Misaki 1.2](/misˈɑki/) [Test](/tɛst/)" in chunks[0][0]


@pytest.mark.asyncio
async def test_smart_split_long_text():
Expand Down Expand Up @@ -116,7 +125,7 @@ def test_process_text_chunk_chinese_phonemes():
def test_get_sentence_info_chinese():
"""Test Chinese sentence splitting and info extraction."""
text = "这是一个句子。这是第二个句子!第三个问题?"
results = get_sentence_info(text, {}, lang_code="z")
results = get_sentence_info(text, lang_code="z")

assert len(results) == 3
for sentence, tokens, count in results:
Expand Down
70 changes: 63 additions & 7 deletions docker-bake.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,25 @@ target "_gpu_base" {
dockerfile = "docker/gpu/Dockerfile"
}

# Base settings for AMD ROCm builds
target "_rocm_base" {
inherits = ["_common"]
dockerfile = "docker/rocm/Dockerfile"
}

# CPU target with multi-platform support
target "cpu" {
inherits = ["_cpu_base"]
platforms = ["linux/amd64", "linux/arm64"]
platforms = ["linux/amd64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
]
}

target "cpu-arm64" {
inherits = ["_cpu_base"]
platforms = ["linux/arm64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
Expand All @@ -53,16 +68,51 @@ target "cpu" {
# GPU target with multi-platform support
target "gpu" {
inherits = ["_gpu_base"]
platforms = ["linux/amd64", "linux/arm64"]
platforms = ["linux/amd64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
]
}

# Default group to build both CPU and GPU versions
group "default" {
targets = ["cpu", "gpu"]
target "gpu-arm64" {
inherits = ["_gpu_base"]
platforms = ["linux/arm64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
]
}

# AMD ROCm target with multi-platform support
target "rocm" {
inherits = ["_rocm_base"]
platforms = ["linux/amd64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-rocm:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-rocm:latest"
]
}

# Build groups for parallel builds
group "cpu" {
targets = ["cpu"]
}

group "cpu-arm64" {
targets = ["cpu-arm64"]
}

group "gpu-arm64" {
targets = ["gpu-arm64"]
}

group "gpu" {
targets = ["gpu"]
}

group "rocm" {
targets = ["rocm"]
}

# Development targets for faster local builds
Expand All @@ -78,6 +128,12 @@ target "gpu-dev" {
tags = ["${REGISTRY}/${OWNER}/${REPO}-gpu:dev"]
}

target "rocm-dev" {
inherits = ["_rocm_base"]
# No multi-platform for dev builds
tags = ["${REGISTRY}/${OWNER}/${REPO}-rocm:dev"]
}

group "dev" {
targets = ["cpu-dev", "gpu-dev"]
}
targets = ["cpu-dev", "gpu-dev", "rocm-dev"]
}
48 changes: 17 additions & 31 deletions docker/cpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,26 +1,17 @@
FROM python:3.10-slim

# Install dependencies and check espeak location
RUN apt-get update && apt-get install -y \
espeak-ng \
espeak-ng-data \
git \
libsndfile1 \
curl \
ffmpeg \
g++ \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /usr/share/espeak-ng-data \
&& ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
# Rust is required to build sudachipy and pyopenjtalk-plus
RUN apt-get update -y && \
apt-get install -y espeak-ng espeak-ng-data git libsndfile1 curl ffmpeg g++ && \
apt-get clean && rm -rf /var/lib/apt/lists/* && \
mkdir -p /usr/share/espeak-ng-data && \
ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/ && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
mv /root/.local/bin/uv /usr/local/bin/ && \
mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1000 appuser && \
mv /root/.local/bin/uvx /usr/local/bin/ && \
curl https://sh.rustup.rs -sSf | sh -s -- -y && \
useradd -m -u 1000 appuser && \
mkdir -p /app/api/src/models/v1_0 && \
chown -R appuser:appuser /app

Expand All @@ -30,14 +21,9 @@ WORKDIR /app
# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

# Install Rust (required to build sudachipy and pyopenjtalk-plus)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/home/appuser/.cargo/bin:$PATH"

# Install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
uv venv --python 3.10 && \
uv sync --extra cpu
# Install dependencies with CPU extras
RUN uv venv --python 3.10 && \
uv sync --extra cpu --no-cache

# Copy project files including models
COPY --chown=appuser:appuser api ./api
Expand All @@ -46,21 +32,21 @@ COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
ENV PATH="/home/appuser/.cargo/bin:/app/.venv/bin:$PATH" \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app:/app/api \
PATH="/app/.venv/bin:$PATH" \
UV_LINK_MODE=copy \
USE_GPU=false \
PHONEMIZER_ESPEAK_PATH=/usr/bin \
PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
ESPEAK_DATA_PATH=/usr/share/espeak-ng-data \
DEVICE="cpu"

ENV DOWNLOAD_MODEL=true
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
python download_model.py --output api/src/models/v1_0; \
fi

ENV DEVICE="cpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
Loading