Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
b064066
Add experimental rocm support
Jan 18, 2025
1d01755
Fix merge conflicts
kprinssu Mar 20, 2025
aba2628
Update Dockerfile to build against newer code
kprinssu Mar 22, 2025
a665cfc
Merge branch 'remsky:master' into master
kprinssu Mar 24, 2025
4f149ab
Use my fork of Kokoro
kprinssu Mar 25, 2025
3db522b
Merge branch 'master' of github.com:kprinssu/Kokoro-FastAPI
kprinssu Mar 25, 2025
2185b1b
Merge branch 'remsky:master' into master
kprinssu Mar 27, 2025
e9d9921
Merge branch 'remsky:master' into master
kprinssu Mar 30, 2025
19ac239
Add notes about performance inside docker compose
kprinssu Mar 31, 2025
9e595d9
Revert to upstream Kokoro
kprinssu Mar 31, 2025
94e5305
Remove casual_conv1d
kprinssu Apr 1, 2025
b19fd1d
Fix merge conflicts
kprinssu Apr 5, 2025
105b96e
Update to ROCm 6.4 and use uv to install dependencies
kprinssu Apr 14, 2025
e464b3a
Merge branch 'remsky:release' into release
kprinssu Apr 14, 2025
cec37ba
Remove uv.lock
kprinssu Apr 14, 2025
5696c5d
Use default versioning for numpy
kprinssu Apr 14, 2025
f248658
Merge branch 'release' of github.com:kprinssu/Kokoro-FastAPI into rel…
kprinssu Apr 14, 2025
c5086c7
Merge branch 'remsky:master' into master
kprinssu Apr 28, 2025
c1214b7
Merge remote-tracking branch 'origin/master' into release
kprinssu Apr 30, 2025
32191c6
Use ROCm 6.4 and smaller base Docker image
kprinssu Apr 30, 2025
9e330c6
Update training notes
kprinssu May 2, 2025
800bd2f
Merge branch 'remsky:master' into master
kprinssu May 21, 2025
6b1e9d9
Merge pull request #339 from fireblade2534/master
fireblade2534 Jun 18, 2025
0241423
Merge branch 'remsky:master' into master
kprinssu Jun 21, 2025
20e6281
Merge branch 'remsky:release' into release
kprinssu Jun 21, 2025
986cf04
Update to ROCm 6.4.1
kprinssu Jun 21, 2025
2a6d2ae
Fix custom phenomes and make them more robust
fireblade2534 Jun 26, 2025
cab2901
Merge branch 'remsky:master' into master
fireblade2534 Jun 26, 2025
8a55cd5
Update torch to 2.7.1 & Cuda 12.8.1 in Docker
MiggiV2 Jun 27, 2025
f8c8916
Merge pull request #350 from fireblade2534/master
fireblade2534 Jun 27, 2025
6805c8f
Merge branch 'remsky:master' into master
kprinssu Jun 29, 2025
a109a33
Merge remote-tracking branch 'origin/master' into release
kprinssu Jun 29, 2025
ce3cd62
Add ROCm Dockerfile to docker-bake.hcl
kprinssu Jun 29, 2025
5622fd4
Clean up more disk space
kprinssu Jun 29, 2025
105cc67
Disable ROCm arm64 builds
kprinssu Jun 29, 2025
379e858
Split up builds into a matrix
kprinssu Jun 29, 2025
35fd504
Fix typo
kprinssu Jun 29, 2025
ad65800
Make myself the repo owner
kprinssu Jun 29, 2025
fae42d3
Enable builds for master
kprinssu Jun 29, 2025
2ad4467
Revert "Make myself the repo owner"
kprinssu Jun 29, 2025
1d6d455
Minor revert to clean up disk space step
kprinssu Jun 29, 2025
130e6d9
Increase build step timeout
kprinssu Jun 29, 2025
726803a
Split arm64 and x86_64 builds
kprinssu Jun 29, 2025
1aaf004
Update pipelines to also build arm64
kprinssu Jun 29, 2025
92505e3
Fix arm64 cpu build target
kprinssu Jun 29, 2025
70b9521
Revert changes to pyproject toml to use pytorch nightly
kprinssu Jun 30, 2025
7e9108e
Revert changes to pyproject toml to use pytorch nightly
kprinssu Jun 30, 2025
67ac35f
Split up arm64 builds into their own targets
kprinssu Jun 30, 2025
812a72d
Bump version to 0.2.4-1
kprinssu Jun 30, 2025
84f5cc4
Reduce bloat in ROCm Dockerfile
kprinssu Jun 30, 2025
29066f7
Merge pull request #354 from MiggiV2/master
fireblade2534 Jul 4, 2025
acc671f
Smaller docker image:
faltiska Jul 12, 2025
bbb6ab6
Merge pull request #361 from faltiska/smaller-docker-image
fireblade2534 Jul 29, 2025
ef20f78
Merge branch 'remsky:master' into master
kprinssu Aug 10, 2025
5528a5a
Update Dockerfile to ROCm 6.4.3 and use newly released Pytorch 2.8
kprinssu Aug 10, 2025
dc2601b
Install missing rocrand package
kprinssu Aug 11, 2025
7e6d339
Fix merge conflicts
kprinssu Aug 11, 2025
b3a3af6
Clean up pyproject with duplicate indices
kprinssu Aug 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ jobs:
build-images:
needs: prepare-release
runs-on: ubuntu-latest
timeout-minutes: 60
permissions:
packages: write # Needed to push images to GHCR
env:
DOCKER_BUILDKIT: 1
BUILDKIT_STEP_LOG_MAX_SIZE: 10485760
# This environment variable will override the VERSION variable in docker-bake.hcl
VERSION: ${{ needs.prepare-release.outputs.version_tag }} # Use tag version (vX.Y.Z) for bake
strategy:
matrix:
build_target: ["cpu", "cpu-arm64", "gpu-arm64", "gpu", "rocm"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -60,7 +64,7 @@ jobs:
df -h
echo "Cleaning up disk space..."
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache
docker system prune -af
sudo docker system prune -af
echo "Disk space after cleanup"
df -h

Expand All @@ -85,7 +89,7 @@ jobs:
run: |
echo "Building and pushing images for version ${{ needs.prepare-release.outputs.version_tag }}"
# The VERSION env var above sets the tag for the bake file targets
docker buildx bake --push
docker buildx bake ${{ matrix.build_target }} --push

create-release:
needs: [prepare-release, build-images]
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.4
0.2.4-2
2 changes: 1 addition & 1 deletion api/src/services/text_processing/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,4 +497,4 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st

text = re.sub(r"\s{2,}", " ", text)

return text.strip()
return text
27 changes: 9 additions & 18 deletions api/src/services/text_processing/text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# Pre-compiled regex patterns for performance
# Updated regex to be more strict and avoid matching isolated brackets
# Only matches complete patterns like [word](/ipa/) and prevents catastrophic backtracking
CUSTOM_PHONEMES = re.compile(r"(\[[^\[\]]*?\])(\(\/[^\/\(\)]*?\/\))")
CUSTOM_PHONEMES = re.compile(r"(\[[^\[\]]*?\]\(\/[^\/\(\)]*?\/\))")
# Pattern to find pause tags like [pause:0.5s]
PAUSE_TAG_PATTERN = re.compile(r"\[pause:(\d+(?:\.\d+)?)s\]", re.IGNORECASE)

Expand Down Expand Up @@ -100,7 +100,7 @@ def process_text(text: str, language: str = "a") -> List[int]:


def get_sentence_info(
text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a"
text: str, lang_code: str = "a"
) -> List[Tuple[str, List[int], int]]:
"""Process all sentences and return info"""
# Detect Chinese text
Expand All @@ -110,18 +110,10 @@ def get_sentence_info(
sentences = re.split(r"([,。!?;])+", text)
else:
sentences = re.split(r"([.!?;:])(?=\s|$)", text)
phoneme_length, min_value = len(custom_phenomes_list), 0

results = []
for i in range(0, len(sentences), 2):
sentence = sentences[i].strip()
for replaced in range(min_value, phoneme_length):
current_id = f"</|custom_phonemes_{replaced}|/>"
if current_id in sentence:
sentence = sentence.replace(
current_id, custom_phenomes_list.pop(current_id)
)
min_value += 1
punct = sentences[i + 1] if i + 1 < len(sentences) else ""
if not sentence:
continue
Expand Down Expand Up @@ -173,24 +165,23 @@ async def smart_split(
# Strip leading and trailing spaces to prevent pause tag splitting artifacts
text_part_raw = text_part_raw.strip()

# Apply the original smart_split logic to this text part
custom_phoneme_list = {}

# Normalize text (original logic)
processed_text = text_part_raw
if settings.advanced_text_normalization and normalization_options.normalize:
if lang_code in ["a", "b", "en-us", "en-gb"]:
processed_text = CUSTOM_PHONEMES.sub(
lambda s: handle_custom_phonemes(s, custom_phoneme_list), processed_text
)
processed_text = normalize_text(processed_text, normalization_options)
processed_text = CUSTOM_PHONEMES.split(processed_text)
for index in range(0, len(processed_text), 2):
processed_text[index] = normalize_text(processed_text[index], normalization_options)


processed_text = "".join(processed_text).strip()
else:
logger.info(
"Skipping text normalization as it is only supported for english"
)

# Process all sentences (original logic)
sentences = get_sentence_info(processed_text, custom_phoneme_list, lang_code=lang_code)
sentences = get_sentence_info(processed_text, lang_code=lang_code)

current_chunk = []
current_tokens = []
Expand Down
49 changes: 29 additions & 20 deletions api/tests/test_text_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_process_text_chunk_phonemes():
def test_get_sentence_info():
"""Test sentence splitting and info extraction."""
text = "This is sentence one. This is sentence two! What about three?"
results = get_sentence_info(text, {})
results = get_sentence_info(text)

assert len(results) == 3
for sentence, tokens, count in results:
Expand All @@ -44,24 +44,6 @@ def test_get_sentence_info():
assert count == len(tokens)
assert count > 0


def test_get_sentence_info_phenomoes():
"""Test sentence splitting and info extraction."""
text = (
"This is sentence one. This is </|custom_phonemes_0|/> two! What about three?"
)
results = get_sentence_info(text, {"</|custom_phonemes_0|/>": r"sˈɛntᵊns"})

assert len(results) == 3
assert "sˈɛntᵊns" in results[1][0]
for sentence, tokens, count in results:
assert isinstance(sentence, str)
assert isinstance(tokens, list)
assert isinstance(count, int)
assert count == len(tokens)
assert count > 0


@pytest.mark.asyncio
async def test_smart_split_short_text():
"""Test smart splitting with text under max tokens."""
Expand All @@ -74,6 +56,33 @@ async def test_smart_split_short_text():
assert isinstance(chunks[0][0], str)
assert isinstance(chunks[0][1], list)

@pytest.mark.asyncio
async def test_smart_custom_phenomes():
    """Test that custom phoneme annotations survive smart splitting intact."""
text = "This is a short test sentence. [Kokoro](/kˈOkəɹO/) has a feature called custom phenomes. This is made possible by [Misaki](/misˈɑki/), the custom phenomizer that [Kokoro](/kˈOkəɹO/) version 1.0 uses"
chunks = []
async for chunk_text, chunk_tokens, pause_duration in smart_split(text):
chunks.append((chunk_text, chunk_tokens, pause_duration))

    # Should have 1 chunk: text
assert len(chunks) == 1

# First chunk: text
assert chunks[0][2] is None # No pause
assert "This is a short test sentence. [Kokoro](/kˈOkəɹO/) has a feature called custom phenomes. This is made possible by [Misaki](/misˈɑki/), the custom phenomizer that [Kokoro](/kˈOkəɹO/) version one uses" in chunks[0][0]
assert len(chunks[0][1]) > 0

@pytest.mark.asyncio
async def test_smart_split_only_phenomes():
    """Test input that is entirely made of phoneme annotations."""
text = "[Kokoro](/kˈOkəɹO/) [Misaki 1.2](/misˈɑki/) [Test](/tɛst/)"
chunks = []
async for chunk_text, chunk_tokens, pause_duration in smart_split(text, max_tokens=10):
chunks.append((chunk_text, chunk_tokens, pause_duration))

assert len(chunks) == 1
assert "[Kokoro](/kˈOkəɹO/) [Misaki 1.2](/misˈɑki/) [Test](/tɛst/)" in chunks[0][0]


@pytest.mark.asyncio
async def test_smart_split_long_text():
Expand Down Expand Up @@ -116,7 +125,7 @@ def test_process_text_chunk_chinese_phonemes():
def test_get_sentence_info_chinese():
"""Test Chinese sentence splitting and info extraction."""
text = "这是一个句子。这是第二个句子!第三个问题?"
results = get_sentence_info(text, {}, lang_code="z")
results = get_sentence_info(text, lang_code="z")

assert len(results) == 3
for sentence, tokens, count in results:
Expand Down
70 changes: 63 additions & 7 deletions docker-bake.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,25 @@ target "_gpu_base" {
dockerfile = "docker/gpu/Dockerfile"
}

# Base settings for AMD ROCm builds
target "_rocm_base" {
inherits = ["_common"]
dockerfile = "docker/rocm/Dockerfile"
}

# CPU target with multi-platform support
target "cpu" {
inherits = ["_cpu_base"]
platforms = ["linux/amd64", "linux/arm64"]
platforms = ["linux/amd64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
]
}

target "cpu-arm64" {
inherits = ["_cpu_base"]
platforms = ["linux/arm64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-cpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-cpu:latest"
Expand All @@ -53,16 +68,51 @@ target "cpu" {
# GPU target with multi-platform support
target "gpu" {
inherits = ["_gpu_base"]
platforms = ["linux/amd64", "linux/arm64"]
platforms = ["linux/amd64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
]
}

# Default group to build both CPU and GPU versions
group "default" {
targets = ["cpu", "gpu"]
target "gpu-arm64" {
inherits = ["_gpu_base"]
platforms = ["linux/arm64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-gpu:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-gpu:latest"
]
}

# AMD ROCm target with multi-platform support
target "rocm" {
inherits = ["_rocm_base"]
platforms = ["linux/amd64"]
tags = [
"${REGISTRY}/${OWNER}/${REPO}-rocm:${VERSION}",
"${REGISTRY}/${OWNER}/${REPO}-rocm:latest"
]
}

# Build groups for parallel builds
group "cpu" {
targets = ["cpu"]
}

group "cpu-arm64" {
targets = ["cpu-arm64"]
}

group "gpu-arm64" {
targets = ["gpu-arm64"]
}

group "gpu" {
targets = ["gpu"]
}

group "rocm" {
targets = ["rocm"]
}

# Development targets for faster local builds
Expand All @@ -78,6 +128,12 @@ target "gpu-dev" {
tags = ["${REGISTRY}/${OWNER}/${REPO}-gpu:dev"]
}

target "rocm-dev" {
inherits = ["_rocm_base"]
# No multi-platform for dev builds
tags = ["${REGISTRY}/${OWNER}/${REPO}-rocm:dev"]
}

group "dev" {
targets = ["cpu-dev", "gpu-dev"]
}
targets = ["cpu-dev", "gpu-dev", "rocm-dev"]
}
48 changes: 17 additions & 31 deletions docker/cpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,26 +1,17 @@
FROM python:3.10-slim

# Install dependencies and check espeak location
RUN apt-get update && apt-get install -y \
espeak-ng \
espeak-ng-data \
git \
libsndfile1 \
curl \
ffmpeg \
g++ \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /usr/share/espeak-ng-data \
&& ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/

# Install UV using the installer script
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
# Rust is required to build sudachipy and pyopenjtalk-plus
RUN apt-get update -y && \
apt-get install -y espeak-ng espeak-ng-data git libsndfile1 curl ffmpeg g++ && \
apt-get clean && rm -rf /var/lib/apt/lists/* && \
mkdir -p /usr/share/espeak-ng-data && \
ln -s /usr/lib/*/espeak-ng-data/* /usr/share/espeak-ng-data/ && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
mv /root/.local/bin/uv /usr/local/bin/ && \
mv /root/.local/bin/uvx /usr/local/bin/

# Create non-root user and set up directories and permissions
RUN useradd -m -u 1000 appuser && \
mv /root/.local/bin/uvx /usr/local/bin/ && \
curl https://sh.rustup.rs -sSf | sh -s -- -y && \
useradd -m -u 1000 appuser && \
mkdir -p /app/api/src/models/v1_0 && \
chown -R appuser:appuser /app

Expand All @@ -30,14 +21,9 @@ WORKDIR /app
# Copy dependency files
COPY --chown=appuser:appuser pyproject.toml ./pyproject.toml

# Install Rust (required to build sudachipy and pyopenjtalk-plus)
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/home/appuser/.cargo/bin:$PATH"

# Install dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
uv venv --python 3.10 && \
uv sync --extra cpu
# Install dependencies with CPU extras
RUN uv venv --python 3.10 && \
uv sync --extra cpu --no-cache

# Copy project files including models
COPY --chown=appuser:appuser api ./api
Expand All @@ -46,21 +32,21 @@ COPY --chown=appuser:appuser docker/scripts/ ./
RUN chmod +x ./entrypoint.sh

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
ENV PATH="/home/appuser/.cargo/bin:/app/.venv/bin:$PATH" \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app:/app/api \
PATH="/app/.venv/bin:$PATH" \
UV_LINK_MODE=copy \
USE_GPU=false \
PHONEMIZER_ESPEAK_PATH=/usr/bin \
PHONEMIZER_ESPEAK_DATA=/usr/share/espeak-ng-data \
ESPEAK_DATA_PATH=/usr/share/espeak-ng-data
ESPEAK_DATA_PATH=/usr/share/espeak-ng-data \
DEVICE="cpu"

ENV DOWNLOAD_MODEL=true
# Download model if enabled
RUN if [ "$DOWNLOAD_MODEL" = "true" ]; then \
python download_model.py --output api/src/models/v1_0; \
fi

ENV DEVICE="cpu"
# Run FastAPI server through entrypoint.sh
CMD ["./entrypoint.sh"]
Loading