Skip to content

feat(ui): add Token Estimator link to footer #337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,17 @@ If you ever get stuck, reach out on [Discord](https://discord.com/invite/zerRaGK
13. **Iterate** on any review feedback—update your branch and repeat **6 – 11** as needed.

*(Optional) Invite a maintainer to your branch for easier collaboration.*

---

## CSS & build artefacts

- **Do not commit `src/static/css/site.css`.** The CI pipeline runs `npm run build:css` during the container/image build, so the artefact is produced automatically.

- When developing locally you may run the build yourself (see step 9) so you can preview the styles.

## Dependency Management

When you add a new import from an external package, make sure to add it to both `requirements.txt` and `pyproject.toml` (if applicable). This ensures all environments and CI/CD pipelines have the correct dependencies installed.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies = [
"tiktoken>=0.7.0", # Support for o200k_base encoding
"typing_extensions>= 4.0.0; python_version < '3.10'",
"uvicorn>=0.11.7", # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)
    "autotiktokenizer",
]

license = {file = "LICENSE"}
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ slowapi
starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
tiktoken>=0.7.0 # Support for o200k_base encoding
uvicorn>=0.11.7 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150
autotiktokenizer
4 changes: 3 additions & 1 deletion src/server/routers/dynamic.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""The dynamic router module defines handlers for dynamic path requests."""

from fastapi import APIRouter, Request
from fastapi import APIRouter, Depends, Request, HTTPException
from fastapi.responses import HTMLResponse

from server.server_config import templates
Expand Down Expand Up @@ -29,6 +29,8 @@ async def catch_all(request: Request, full_path: str) -> HTMLResponse:
and other default parameters such as file size.

"""
if full_path.startswith("api/"):
raise HTTPException(status_code=405, detail="Method Not Allowed")
return templates.TemplateResponse(
"git.jinja",
{
Expand Down
176 changes: 175 additions & 1 deletion src/server/routers/index.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,66 @@
"""Module defining the FastAPI router for the home page of the application."""

from fastapi import APIRouter, Request
from fastapi import APIRouter, Depends, Request, Form, HTTPException

from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from autotiktokenizer import AutoTikTokenizer
import tiktoken
from typing import Optional

from gitingest.utils.compat_typing import Annotated
from server.models import QueryForm
from server.query_processor import process_query
from server.server_config import EXAMPLE_REPOS, templates
from server.server_utils import limiter
from pydantic import BaseModel, Field


router = APIRouter()

templates = Jinja2Templates(directory="server/templates")

SUPPORTED_MODELS = {
'GPT-2 (OpenAI)': 'openai-community/gpt2',
'GPT-3 (OpenAI)': 'openai-community/gpt2',
'GPT-3.5 (OpenAI)': 'openai-community/gpt2',
'GPT-3.5-turbo (OpenAI)': 'openai-community/gpt2',
'GPT-4 (OpenAI)': 'openai-community/gpt2',
'Claude (approximate, uses GPT-2)': 'openai-community/gpt2',
'Gemini (approximate, uses T5)': 't5-base',
'Llama-2 (Meta)': 'meta-llama/Llama-2-7b-hf',
'Llama-3 (Meta)': 'meta-llama/Meta-Llama-3-8B',
'Mistral-7B (MistralAI)': 'mistralai/Mistral-7B-v0.1',
'Mixtral-8x7B (MistralAI)': 'mistralai/Mixtral-8x7B-v0.1',
'Phi-3-mini (Microsoft)': 'microsoft/phi-3-mini-4k-instruct',
'Gemma-2B (Google)': 'google/gemma-2b',
'Qwen2-7B (Alibaba)': 'Qwen/Qwen2-7B',
'Yi-34B (01.AI)': '01-ai/Yi-34B-Chat',
'Falcon-7B (TII)': 'tiiuae/falcon-7b',
'MPT-7B (MosaicML)': 'mosaicml/mpt-7b',
'Baichuan-7B (Baichuan)': 'baichuan-inc/Baichuan-7B',
'XLM-RoBERTa-base (Facebook)': 'xlm-roberta-base',
'RoBERTa-base (Facebook)': 'roberta-base',
'DistilBERT-base-uncased': 'distilbert-base-uncased',
'GPT-Neo-1.3B (EleutherAI)': 'EleutherAI/gpt-neo-1.3B',
'GPT-J-6B (EleutherAI)': 'EleutherAI/gpt-j-6B',
'GPT-Bloom-560m (BigScience)': 'bigscience/bloom-560m',
'BERT-base-uncased': 'bert-base-uncased',
'T5-base': 't5-base',
}
# Note: Gemini and Claude use approximate tokenizers (T5 and GPT-2, respectively) as no official public tokenizers exist for these models.

def get_tokenizer(model_id):
return AutoTikTokenizer.from_pretrained(model_id)

def count_tokens(input_text, model_id):
if model_id == 'openai-community/gpt2':
# Use tiktoken for OpenAI models
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
return len(enc.encode(input_text))
else:
tokenizer = AutoTikTokenizer.from_pretrained(model_id)
return len(tokenizer.encode(input_text))

@router.get("/", response_class=HTMLResponse, include_in_schema=False)
async def home(request: Request) -> HTMLResponse:
Expand Down Expand Up @@ -35,3 +89,123 @@ async def home(request: Request) -> HTMLResponse:
"default_max_file_size": 243,
},
)


@router.post("/", response_class=HTMLResponse)
@limiter.limit("10/minute")
async def index_post(request: Request, form: Annotated[QueryForm, Depends(QueryForm.as_form)]) -> HTMLResponse:
"""Process the form submission with user input for query parameters.

This endpoint handles POST requests from the home page form. It processes the user-submitted
input (e.g., text, file size, pattern type) and invokes the ``process_query`` function to handle
the query logic, returning the result as an HTML response.

Parameters
----------
request : Request
The incoming request object, which provides context for rendering the response.
form : Annotated[QueryForm, Depends(QueryForm.as_form)]
The form data submitted by the user.

Returns
-------
HTMLResponse
An HTML response containing the results of processing the form input and query logic,
which will be rendered and returned to the user.

"""
resolved_token = form.token if form.token else None
return await process_query(
request,
input_text=form.input_text,
slider_position=form.max_file_size,
pattern_type=form.pattern_type,
pattern=form.pattern,
is_index=True,
token=resolved_token,
)


class TokenCountRequest(BaseModel):
input_text: str = Field(..., description="The text to count tokens for")
model_id: str = Field(default="openai-community/gpt2", description="The model ID to use for tokenization")

class TokenCountResponse(BaseModel):
token_count: int = Field(..., description="Number of tokens in the input text")
model_id: str = Field(..., description="Model ID used for tokenization")
character_count: int = Field(..., description="Number of characters in the input text")

@router.post("/api/tokencount", response_model=TokenCountResponse)
async def api_token_count(
request: Optional[TokenCountRequest] = None,
input_text: str = Form(None),
model_id: str = Form(default="openai-community/gpt2"),
):
"""
Count tokens in the provided text using the specified model's tokenizer.
Accepts both JSON and form data.
"""
# If JSON body was provided, use that
if request:
text = request.input_text
model = request.model_id
# Otherwise use form data
else:
text = input_text
model = model_id

if not text or not text.strip():
raise HTTPException(status_code=400, detail="Input text cannot be empty")

if model not in SUPPORTED_MODELS.values():
raise HTTPException(
status_code=400,
detail=f"Unsupported model ID. Must be one of: {', '.join(SUPPORTED_MODELS.values())}"
)

try:
token_count = count_tokens(text, model)
return TokenCountResponse(
token_count=token_count,
model_id=model,
character_count=len(text)
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))

@router.get("/tokencount", response_class=HTMLResponse)
async def tokencount_ui(request: Request):
return templates.TemplateResponse(
"tokencount.jinja",
{"request": request, "supported_models": SUPPORTED_MODELS, "input_text": "", "model_id": "openai-community/gpt2", "result": None, "error": None}
)

@router.post("/tokencount", response_class=HTMLResponse)
async def tokencount_post(request: Request, input_text: str = Form(...), model_id: str = Form("openai-community/gpt2")):
error = None
result = None
if not input_text or not input_text.strip():
error = "Input text cannot be empty."
elif model_id not in SUPPORTED_MODELS.values():
error = f"Unsupported model ID. Must be one of: {', '.join(SUPPORTED_MODELS.values())}"
else:
try:
token_count = count_tokens(input_text, model_id)
result = {
"token_count": token_count,
"model_id": model_id,
"character_count": len(input_text)
}
except Exception as e:
error = str(e)
return templates.TemplateResponse(
"tokencount.jinja",
{
"request": request,
"supported_models": SUPPORTED_MODELS,
"input_text": input_text,
"model_id": model_id,
"result": result,
"error": error
}
)
26 changes: 26 additions & 0 deletions src/server/templates/components/footer.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,38 @@
<div class="grid grid-cols-2 items-center text-gray-900 text-sm">
{# Left column — Chrome + PyPI #}
<div class="flex items-center space-x-4">
{{ footer_icon_link('https://chromewebstore.google.com/detail/adfjahbijlkjfoicpjkhjicpjpjfaood',
'icons/chrome.svg',
'Chrome Extension') }}
{{ footer_icon_link('https://pypi.org/project/gitingest',
'icons/python.svg',
'Python Package') }}
{{ footer_icon_link('/tokencount',
'icons/tokens.svg',
'Token Estimator') }}

</div>
{# Right column - Discord #}
<div class="flex justify-end">
Expand Down
40 changes: 40 additions & 0 deletions src/server/templates/tokencount.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{% extends "base.jinja" %}
{% block title %}Token Estimator{% endblock %}
{% block content %}
<div class="relative">
<div class="w-full h-full absolute inset-0 bg-gray-900 rounded-xl translate-y-2 translate-x-2"></div>
<div class="rounded-xl relative z-20 p-8 sm:p-10 border-[3px] border-gray-900 bg-[#fff4da]">
<h1 class="text-3xl font-bold text-gray-900 mb-4">Token Estimator</h1>
<form method="post" action="/tokencount" class="space-y-6">
<div>
<label for="input_text" class="block mb-2 font-medium">Text to analyze:</label>
<textarea name="input_text" id="input_text" rows="4" required class="w-full border-[3px] border-gray-900 rounded p-2 mb-2 bg-[#E8F0FE] focus:outline-none">{{ input_text if input_text else '' }}</textarea>
</div>
<div class="mb-10">
<label for="model_id" class="block mb-2 font-medium">Model:</label>
<select name="model_id" id="model_id" class="w-full border-[3px] border-gray-900 rounded p-2 bg-[#E8F0FE] focus:outline-none">
{% for name, model in supported_models.items() %}
<option value="{{ model }}" {% if model_id == model %}selected{% endif %}>{{ name }}</option>
{% endfor %}
</select>
</div>
<div>
<button type="submit" class="bg-yellow-500 hover:bg-yellow-600 text-white font-bold py-2 px-4 rounded border-[3px] border-gray-900">Count Tokens</button>
</div>
</form>
{% if result %}
<div class="mt-6 p-4 border-[3px] border-gray-900 rounded bg-white">
<h2 class="text-xl font-semibold mb-2">Result</h2>
<p><b>Token count:</b> {{ result.token_count }}</p>
<p><b>Character count:</b> {{ result.character_count }}</p>
<p><b>Model:</b> {{ result.model_id }}</p>
</div>
{% endif %}
{% if error %}
<div class="mt-6 p-4 border-[3px] border-red-600 rounded bg-red-100 text-red-800">
<b>Error:</b> {{ error }}
</div>
{% endif %}
</div>
</div>
{% endblock %}
23 changes: 23 additions & 0 deletions tests/test_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from fastapi.testclient import TestClient
from src.server.main import app

client = TestClient(app, base_url="http://localhost")


def test_tokencount_valid():
response = client.post("/tokencount", json={"input_text": "Hello world!", "model_id": "openai-community/gpt2"}, headers={"host": "localhost"})
if response.status_code != 200:
print("Response content:", response.content)
assert response.status_code == 200
data = response.json()
assert "token_count" in data
assert isinstance(data["token_count"], int)
assert data["token_count"] > 0

def test_tokencount_missing_input():
response = client.post("/tokencount", json={"model_id": "openai-community/gpt2"}, headers={"host": "localhost"})
if response.status_code != 400:
print("Response content:", response.content)
assert response.status_code == 400
data = response.json()
assert "error" in data