Skip to content

Commit

Permalink
Async: Add option to use Uvloop/Winloop
Browse files Browse the repository at this point in the history
These are faster event loops for asyncio which should improve overall
performance. Gate these under an experimental flag for now to stress
test these loops.

Signed-off-by: kingbri <bdashore3@proton.me>
  • Loading branch information
bdashore3 committed Jul 24, 2024
1 parent 71de306 commit 5c082b7
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 50 deletions.
7 changes: 6 additions & 1 deletion common/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,12 @@ def add_developer_args(parser: argparse.ArgumentParser):
developer_group.add_argument(
"--cuda-malloc-backend",
type=str_to_bool,
help="Disables API request streaming",
help="Runs with the pytorch CUDA malloc backend",
)
developer_group.add_argument(
"--uvloop",
type=str_to_bool,
help="Run asyncio using Uvloop or Winloop",
)


Expand Down
5 changes: 5 additions & 0 deletions config_sample.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ developer:
# This can save a few MBs of VRAM, but has a risk of errors. Use at your own risk.
#cuda_malloc_backend: False

# Enable Uvloop or Winloop (default: False)
# Makes the program use a faster asyncio event loop (uvloop, or winloop on Windows), which can improve performance.
# NOTE: This is experimental. Enabling it is recommended, but if something breaks, turn this off.
#uvloop: False

# Options for model overrides and loading
# Please read the comments to understand how arguments are handled between initial and API loads
model:
Expand Down
5 changes: 5 additions & 0 deletions endpoints/server.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
Expand Down Expand Up @@ -71,11 +72,15 @@ async def start_api(host: str, port: int):
# Setup app
app = setup_app()

# Get the current event loop
loop = asyncio.get_running_loop()

config = uvicorn.Config(
app,
host=host,
port=port,
log_config=UVICORN_LOG_CONFIG,
loop=loop,
)
server = uvicorn.Server(config)

Expand Down
110 changes: 63 additions & 47 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""

import asyncio
import aiofiles
import json
import os
import pathlib
import platform
import signal
from loguru import logger
from typing import Optional
Expand All @@ -23,51 +23,8 @@
from backends.exllamav2.utils import check_exllama_version


async def entrypoint(args: Optional[dict] = None):
"""Entry function for program startup"""

setup_logger()

# Set up signal aborting
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

if os.getenv("EXPORT_OPENAPI", "").lower() in ("true", "1"):
openapi_json = export_openapi()

async with aiofiles.open("openapi.json", "w") as f:
await f.write(json.dumps(openapi_json))
logger.info("Successfully wrote OpenAPI spec to openapi.json")

return

# Load from YAML config
config.from_file(pathlib.Path("config.yml"))

# Parse and override config from args
if args is None:
parser = init_argparser()
args = convert_args_to_dict(parser.parse_args(), parser)

config.from_args(args)

developer_config = config.developer_config()

# Check exllamav2 version and give a descriptive error if it's too old
# Skip if launching unsafely

if unwrap(developer_config.get("unsafe_launch"), False):
logger.warning(
"UNSAFE: Skipping ExllamaV2 version check.\n"
"If you aren't a developer, please keep this off!"
)
else:
check_exllama_version()

# Enable CUDA malloc backend
if unwrap(developer_config.get("cuda_malloc_backend"), False):
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
logger.warning("Enabled the experimental CUDA malloc backend.")
async def entrypoint_async():
"""Async entry function for program startup"""

network_config = config.network_config()

Expand Down Expand Up @@ -131,5 +88,64 @@ async def entrypoint(args: Optional[dict] = None):
await start_api(host, port)


def _install_fast_event_loop() -> bool:
    """Install uvloop (winloop on Windows) as the asyncio event loop policy.

    Returns:
        True if the policy was installed, False if the package could not be
        imported (the default asyncio event loop is then left in place).
    """

    on_windows = platform.system() == "Windows"
    package_name = "winloop" if on_windows else "uvloop"

    # The package may be missing on this platform (e.g. it was never installed
    # for this OS/architecture), so a failed import must not abort startup.
    try:
        if on_windows:
            from winloop import install
        else:
            from uvloop import install
    except ImportError:
        logger.warning(
            f"Uvloop/Winloop was requested, but {package_name} is not installed. "
            "Falling back to the default asyncio event loop."
        )
        return False

    # Set loop event policy
    install()
    return True


def entrypoint(arguments: Optional[dict] = None):
    """Synchronous entry function for program startup.

    Sets up logging and signal handlers, loads the config (YAML file first,
    then argument overrides), applies the developer options and finally runs
    entrypoint_async() inside an asyncio event loop.

    Args:
        arguments: Argument overrides as a dict. When None, the arguments are
            parsed with the program's argument parser instead.
    """

    setup_logger()

    # Set up signal aborting
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Export mode: write the OpenAPI spec and exit without starting anything
    if do_export_openapi:
        openapi_json = export_openapi()

        with open("openapi.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(openapi_json))
            logger.info("Successfully wrote OpenAPI spec to openapi.json")

        return

    # Load from YAML config
    config.from_file(pathlib.Path("config.yml"))

    # Parse and override config from args
    if arguments is None:
        parser = init_argparser()
        arguments = convert_args_to_dict(parser.parse_args(), parser)

    config.from_args(arguments)
    developer_config = config.developer_config()

    # Check exllamav2 version and give a descriptive error if it's too old
    # Skip if launching unsafely
    if unwrap(developer_config.get("unsafe_launch"), False):
        logger.warning(
            "UNSAFE: Skipping ExllamaV2 version check.\n"
            "If you aren't a developer, please keep this off!"
        )
    else:
        check_exllama_version()

    # Enable CUDA malloc backend
    if unwrap(developer_config.get("cuda_malloc_backend"), False):
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")

    # Use Uvloop/Winloop
    # This must happen before asyncio.run() so the new loop policy is used
    if unwrap(developer_config.get("uvloop"), False):
        if _install_fast_event_loop():
            logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")

    # Enter into the async event loop
    asyncio.run(entrypoint_async())


if __name__ == "__main__":
asyncio.run(entrypoint())
entrypoint()
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ dependencies = [
"lm-format-enforcer >= 0.9.6",
"aiofiles",

# Improved asyncio loops
"uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
"winloop ; platform_system == 'Windows'",

# TEMP: Remove once 2.x is fixed in upstream
"numpy < 2.0.0",

Expand Down
4 changes: 2 additions & 2 deletions start.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Utility to automatically upgrade and start the API"""

import asyncio
import argparse
import os
import pathlib
Expand Down Expand Up @@ -159,4 +158,5 @@ def add_start_args(parser: argparse.ArgumentParser):
# Import entrypoint after installing all requirements
from main import entrypoint

asyncio.run(entrypoint(convert_args_to_dict(args, parser)))
converted_args = convert_args_to_dict(args, parser)
entrypoint(converted_args)

0 comments on commit 5c082b7

Please sign in to comment.