
Commit 509fffd

simple-inference (#47)
1 parent 882f190 commit 509fffd

File tree

6 files changed: +285 -0 lines changed


python/llamacpp/README.md

Lines changed: 91 additions & 0 deletions
# Python Llama_cpp Function (HTTP)

Welcome to your Llama-cpp Function, which integrates a basic client-side setup
of the [Llama-cpp library](https://github.com/abetlen/llama-cpp-python). The Function accepts JSON input, processes it
through a local LLM, and returns the generated response.

The Function itself uses the ASGI protocol.

## Deployment

> [!NOTE]
> We recommend using the host builder.

```bash
# Run the function locally
func run --builder=host

# Deploy to the cluster
func deploy --builder=host
```

## How to use the API

The Function accepts POST requests with JSON data. You can create a request like
this:
```bash
curl localhost:8080 -d '{"input":"The largest mountain in the world is"}'
```

GET requests return the string 'OK' for a quick health check.

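If you prefer calling the endpoint from Python, a minimal client sketch using `httpx` (already listed in `pyproject.toml`) could look like the following; the file name is illustrative and the address assumes the local default used by `func run`:

```python
# client_example.py -- illustrative only, not part of the template
import httpx

# Address assumes the Function is running locally via `func run --builder=host`
url = "http://localhost:8080"

response = httpx.post(url, json={"input": "The largest mountain in the world is"})
print(response.status_code)  # 200 on success
print(response.text)         # text generated by the local LLM
```
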
## Customization

- The Function uses the ASGI protocol and is compatible with the
  `handle(scope, receive, send)` signature.
- You can use a local model (e.g. one passed in via a base image -- see the
  Dockerfile example in the Dependencies section below) by switching the
  `Llama()` call in the `handle()` function for the commented-out code, as shown
  in the sketch after this list. You will need to provide a path to the model via
  the `model_path` argument instead of a `repo_id` and `filename`.
- As usual, the Function implements readiness and liveness checks as well as
  start and stop hooks, implemented via methods of the same names. These can be
  found at the bottom of the Function class, with more detailed information in
  the comments.

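For illustration, here is a minimal sketch of the local-model variant described above; the model path is hypothetical and should point at wherever your base image places the GGUF file:

```python
from llama_cpp import Llama

# Hypothetical path -- adjust to wherever the base image copies your model
llm = Llama(
    model_path="/path/to/model/in/container/granite-3b-code-base.Q4_K_M.gguf",
    n_ctx=1024,
)

output = llm("The largest mountain in the world is", max_tokens=32, echo=False)
print(output["choices"][0]["text"])
```
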
## Tests

Tests use the `pytest` framework with asyncio.

The Function tests can be found in the `tests` directory. It contains a simple
HTTP request test. This is where you can create your own tests for the desired
functionality.

```bash
# Install dependencies (if not done already)
pip install -e .

# Run the tests
pytest

# Run verbosely
pytest -v
```

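As an example of a test you might add, the sketch below POSTs an invalid JSON body through a fake ASGI `receive` callable and only checks that the Function responds. The test name and assertions are illustrative, and it assumes the error path in `handle()` replies without loading a model:

```python
import pytest
from function import new


@pytest.mark.asyncio
async def test_function_handle_invalid_json():
    f = new()
    sent = []

    # Fake ASGI receive: a single chunk whose body is not valid JSON
    async def receive():
        return {"type": "http.request", "body": b"not json", "more_body": False}

    # Fake ASGI send: record everything the Function sends back
    async def send(message):
        sent.append(message)

    await f.handle({"method": "POST"}, receive, send)

    # The Function should have answered with a response start and a body
    assert any(m.get("type") == "http.response.start" for m in sent)
    assert any(m.get("type") == "http.response.body" for m in sent)
```
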
## Dependencies

All dependencies can be found in the `pyproject.toml` file. Any additional
dependencies (e.g. a model when running locally) can also be provided via the
base image mentioned above. You can create a Dockerfile like so:

```Dockerfile
FROM python:3.13-slim
## RUN any bash commands for pip install etc.
COPY /path/to/model/on/host/machine /path/to/model/in/container
```

You build this image, for example using podman, and then pass it to the
Function when building via the `--base-image` flag:
```bash
# Build the base image
podman build -f Dockerfile -t my-base-image .

# Use the base image when building the Function image
func build --base-image=localhost/my-base-image --builder=host

# Or deploy immediately (builds internally)
func deploy --base-image=localhost/my-base-image --builder=host
```

This makes the model accessible to the Function.

For more, see [the complete documentation](https://github.com/knative/func/tree/main/docs).
python/llamacpp/function/__init__.py

Lines changed: 1 addition & 0 deletions

```python
from .func import new
```

python/llamacpp/function/func.py

Lines changed: 127 additions & 0 deletions
```python
# Function
import json
import logging

from llama_cpp import Llama


def new():
    """ New is the only method that must be implemented by a Function.
    The instance returned can be of any name.
    """
    return Function()


class Function:
    def __init__(self):
        """ The init method is an optional method where initialization can be
        performed. See the start method for a startup hook which includes
        configuration.
        """

    async def sender(self, send, obj):
        # echo the obj to the calling client
        await send({
            'type': 'http.response.start',
            'status': 200,
            'headers': [
                [b'content-type', b'text/plain'],
            ],
        })
        await send({
            'type': 'http.response.body',
            'body': obj.encode(),
        })

    async def handle(self, scope, receive, send):
        """
        Accepts data in the form of JSON with the key "input", which should
        contain the input string for the LLM:
        {
            "input": "this is passed to the LLM"
        }
        ex: curl localhost:8080 -d '{"input":"The largest mountain in the world is"}'
        """
        if scope["method"] == "GET":
            await self.sender(send, "OK")
            return

        prompt = ""

        # fetch all of the body from the request
        body = b''
        more_body = True
        while more_body:
            message = await receive()
            body += message.get('body', b'')
            more_body = message.get('more_body', False)

        # decode json
        try:
            data = json.loads(body.decode('utf-8'))
            prompt = data['input']
        except json.JSONDecodeError:
            await self.sender(send, "Invalid JSON")
            return
        except KeyError:
            await self.sender(send, "invalid key, expected 'input'")
            return

        if prompt == "":
            await self.sender(send, "OK")
            return

        # Pull the model from the Hugging Face Hub
        llm = Llama.from_pretrained(
            repo_id="ibm-granite/granite-3b-code-base-2k-GGUF",
            filename="granite-3b-code-base.Q4_K_M.gguf",
            n_ctx=1024,
        )

        ## Use a local model instead
        # llm = Llama(
        #     model_path="/granite-7b-lab-Q4_K_M.gguf/snapshots/sha256-6adeaad8c048b35ea54562c55e454cc32c63118a32c7b8152cf706b290611487/granite-7b-lab-Q4_K_M.gguf",
        #     n_ctx=1024,
        # )

        output = llm(
            prompt,
            max_tokens=32,
            ## Stop generating just before "Q:"; doesn't work well with small models.
            ## Some models are more tuned to the Q: ... A: ... "chat" format --
            ## you would literally type that in your input as: f'Q: {input}. A:'
            # stop=["Q:", "\n"],
            echo=False,
        )
        # logging.info("------------")
        # logging.info(output['choices'][0]['text'])
        await self.sender(send, output['choices'][0]['text'])

    def start(self, cfg):
        """ start is an optional method which is called when a new Function
        instance is started, such as when scaling up or during an update.
        Provided is a dictionary containing all environmental configuration.
        Args:
            cfg (Dict[str, str]): A dictionary containing environmental config.
                In most cases this will be a copy of os.environ, but it is
                best practice to use this cfg dict instead of os.environ.
        """
        logging.info("Function starting")

    def stop(self):
        """ stop is an optional method which is called when a function is
        stopped, such as when scaled down, updated, or manually canceled. Stop
        can block while performing function shutdown/cleanup operations. The
        process will eventually be killed if this method blocks beyond the
        platform's configured maximum shutdown timeout.
        """
        logging.info("Function stopping")

    def alive(self):
        """ alive is an optional method for performing a deep check on your
        Function's liveness. If removed, the system will assume the function
        is alive if the process is running. This is exposed by default at the
        path /health/liveness. The optional string return is a message.
        """
        return True, "Alive"

    def ready(self):
        """ ready is an optional method for performing a deep check on your
        Function's readiness. If removed, the system will assume the function
        is ready if the process is running. This is exposed by default at the
        path /health/readiness.
        """
        return True, "Ready"
```
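
To see how the body-reading loop in `handle()` consumes a chunked ASGI request, here is a small standalone sketch with a fake `receive` that yields the body in two parts; the chunk contents are made up for illustration and this is not part of the template:

```python
import asyncio
import json

# Fake ASGI receive: yields the request body in two chunks, as a server might
chunks = [
    {"type": "http.request", "body": b'{"input": "The largest ', "more_body": True},
    {"type": "http.request", "body": b'mountain in the world is"}', "more_body": False},
]


async def receive():
    return chunks.pop(0)


async def read_body():
    # Same pattern as handle(): keep reading until more_body is False
    body = b''
    more_body = True
    while more_body:
        message = await receive()
        body += message.get('body', b'')
        more_body = message.get('more_body', False)
    return json.loads(body.decode('utf-8'))


print(asyncio.run(read_body()))  # {'input': 'The largest mountain in the world is'}
```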

python/llamacpp/manifest.yaml

Lines changed: 2 additions & 0 deletions
```yaml
build:
  base-image: quay.io/dfridric/custom_llamacpp_base
```

python/llamacpp/pyproject.toml

Lines changed: 26 additions & 0 deletions
```toml
[project]
name = "function"
description = ""
version = "0.1.0"
requires-python = ">=3.9"
readme = "README.md"
license = "MIT"
dependencies = [
    "httpx",
    "pytest",
    "pytest-asyncio",
    "llama-cpp-python",
    "huggingface-hub"
]
authors = [
    { name="Your Name", email="you@example.com"},
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.pytest.ini_options]
asyncio_mode = "strict"
asyncio_default_fixture_loop_scope = "function"
```

python/llamacpp/tests/test_func.py

Lines changed: 38 additions & 0 deletions
```python
"""
An example set of unit tests which confirm that the main handler (the
callable function) returns 200 OK for a simple HTTP GET.
"""
import pytest
from function import new


@pytest.mark.asyncio
async def test_function_handle():
    f = new()  # Instantiate the Function to test

    sent_ok = False
    sent_headers = False
    sent_body = False

    # Mock send
    async def send(message):
        nonlocal sent_ok
        nonlocal sent_headers
        nonlocal sent_body

        if message.get('status') == 200:
            sent_ok = True

        if message.get('type') == 'http.response.start':
            sent_headers = True

        if message.get('type') == 'http.response.body':
            sent_body = True

    # Invoke the Function with a minimal GET scope; receive is unused for GET
    await f.handle({"method": "GET"}, None, send)

    # Assert send was called
    assert sent_ok, "Function did not send a 200 OK"
    assert sent_headers, "Function did not send headers"
    assert sent_body, "Function did not send a body"
```
