Add Qwen 2.5 Coder Model Integration · Add/qwen coder · #6

Open · wants to merge 5 commits into `master`
46 changes: 39 additions & 7 deletions README.md
@@ -8,20 +8,23 @@

---

**Open Codex** is a fully open-source command-line AI assistant inspired by OpenAI Codex, supporting local language models like `phi-4-mini`.
**Open Codex** is a fully open-source command-line AI assistant inspired by OpenAI Codex, supporting optimized local language models.

No API key is required. Everything runs locally.
No API key is required for the default model. Everything runs locally.

Supports:
- **One-shot mode**: `open-codex "list all folders"` -> returns shell command
- 🧠 Local-only execution using supported OS models (currently `phi-4-mini`)
- 🧠 Local-only execution using optimized models:
- phi-4-mini (default, no auth required)
- qwen-2.5-coder (auth required, enhanced for coding tasks)

---
## ✨ Features

- Natural Language to Shell Command (via local models)
- Works on macOS, Linux, and Windows (Python-based)
- Confirmation before execution
- Smart command validation and error handling
- Real-time command output streaming
- Add to clipboard / abort / execute prompt
- One-shot interaction mode (interactive and function-calling coming soon)
- Colored terminal output for better readability
@@ -76,13 +79,42 @@ Once installed, you can use the `open-codex` CLI globally.

### One-shot mode

Basic usage with default model (phi-4-mini):
```bash
open-codex "untar file abc.tar"
open-codex "list all python files"
```

✅ Codex suggests a shell command
Using the Qwen model for enhanced coding tasks:
```bash
# First, set your Hugging Face token
export HUGGINGFACE_TOKEN=your_token_here

# Then use the Qwen model
open-codex --model qwen-2.5-coder "find python files modified today"

# Or provide token directly
open-codex --model qwen-2.5-coder --hf-token your_token_here "your command"
```

✅ Codex suggests a validated shell command
✅ Shows real-time command output
✅ Provides clear error messages
✅ Asks for confirmation / add to clipboard / abort
✅ Executes if approved

### Model Overview

#### phi-4-mini (Default)
- Fast and lightweight
- No authentication required
- Optimized for quick shell commands
- Best for basic file operations and system tasks

#### qwen-2.5-coder
- Enhanced for coding tasks
- Requires Hugging Face authentication
- Improved command validation
- Better for complex development tasks

---

15 changes: 12 additions & 3 deletions src/open_codex/agent_builder.py
@@ -1,14 +1,23 @@
from importlib.resources import files
from typing import Literal, Optional

from open_codex.agents.phi_4_mini import AgentPhi4Mini
from open_codex.agents.qwen_25_coder import AgentQwen25Coder
from open_codex.interfaces.llm_agent import LLMAgent

class AgentBuilder:
ModelType = Literal["phi-4-mini", "qwen-2.5-coder"]

class AgentBuilder:
@staticmethod
def get_agent() -> LLMAgent:
def get_agent(model: ModelType = "phi-4-mini", hf_token: Optional[str] = None) -> LLMAgent:
system_prompt = files("open_codex.resources").joinpath("prompt.txt").read_text(encoding="utf-8")
return AgentPhi4Mini(system_prompt=system_prompt)

if model == "phi-4-mini":
return AgentPhi4Mini(system_prompt=system_prompt)
elif model == "qwen-2.5-coder":
return AgentQwen25Coder(system_prompt=system_prompt, hf_token=hf_token)
else:
raise ValueError(f"Unsupported model: {model}")

@staticmethod
def read_file(file_path: str) -> str:
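For context, here is a hypothetical sketch (not part of this diff) of how a CLI entry point could forward the `--model` and `--hf-token` flags from the README into the new `AgentBuilder.get_agent` signature. The `main` function and argument parsing are assumptions for illustration; only `get_agent(model=..., hf_token=...)` and `one_shot_mode` come from the code in this PR.

```python
import argparse
import os

from open_codex.agent_builder import AgentBuilder


def main() -> None:
    # Flag names mirror the README examples; this wiring is illustrative only.
    parser = argparse.ArgumentParser(prog="open-codex")
    parser.add_argument("prompt", help="natural-language request")
    parser.add_argument("--model", choices=["phi-4-mini", "qwen-2.5-coder"],
                        default="phi-4-mini")
    parser.add_argument("--hf-token", default=os.environ.get("HUGGINGFACE_TOKEN"),
                        help="Hugging Face token (only needed for qwen-2.5-coder)")
    args = parser.parse_args()

    # Dispatch to the selected agent via the extended AgentBuilder API.
    agent = AgentBuilder.get_agent(model=args.model, hf_token=args.hf_token)
    print(agent.one_shot_mode(args.prompt))


if __name__ == "__main__":
    main()
```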
65 changes: 38 additions & 27 deletions src/open_codex/agents/phi_4_mini.py
@@ -1,6 +1,7 @@
import contextlib
import os
import time
import multiprocessing
from typing import List, cast

from huggingface_hub import hf_hub_download # type: ignore
@@ -13,21 +14,19 @@ def download_model(self, model_filename: str,
repo_id: str,
local_dir: str) -> str:
print(
"\nπŸ€– Thank you for using Open Codex!\n"
"πŸ“¦ For the first run, we need to download the model from Hugging Face.\n"
"⏬ This only happens once – it’ll be cached locally for future use.\n"
"πŸ”„ Sit tight, the download will begin now...\n"
"\nπŸ€– Welcome to Open Codex!\n"
"πŸ“¦ First run requires downloading the model.\n"
"⚑️ This model is optimized for quick responses.\n"
)
print("\n⏬ Downloading model phi4-mini ...")

start = time.time()
model_path:str = hf_hub_download(
repo_id=repo_id,
filename=model_filename,
local_dir=local_dir,
)
end = time.time()
print(f"βœ… Model downloaded in {end - start:.2f}s\n")
duration = time.time() - start
print(f"βœ… Model downloaded ({duration:.1f}s)")
return model_path

def __init__(self, system_prompt: str):
Expand All @@ -36,41 +35,53 @@ def __init__(self, system_prompt: str):
local_dir = os.path.expanduser("~/.cache/open-codex")
model_path = os.path.join(local_dir, model_filename)

# check if the model is already downloaded
if not os.path.exists(model_path):
# download the model
model_path = self.download_model(model_filename, repo_id, local_dir)
else:
print(f"We are locking and loading the model for you...\n")
print("πŸš€ Loading Phi-4-mini model...")

# Get optimal thread count for the system
n_threads = min(4, multiprocessing.cpu_count())

# suppress the stderr output from llama_cpp
# this is a workaround for the llama_cpp library
# which prints a lot of warnings and errors to stderr
# when loading the model
# this is a temporary solution until the library is fixed
with AgentPhi4Mini.suppress_native_stderr():
lib_dir = os.path.join(os.path.dirname(__file__), "llama_cpp", "lib")
self.llm: Llama = Llama(
lib_path=os.path.join(lib_dir, "libllama.dylib"),
model_path=model_path)
lib_path = os.path.join(os.path.dirname(__file__), "llama_cpp", "lib", "libllama.dylib")
llama_kwargs = {
"model_path": model_path,
"n_ctx": 2048,
"n_threads": n_threads,
"n_batch": 256,
"use_mlock": True,
"use_mmap": True,
}

self.system_prompt = system_prompt
if os.path.exists(lib_path):
llama_kwargs["lib_path"] = lib_path

self.llm: Llama = Llama(**llama_kwargs)
print("✨ Model ready!")


self.system_prompt = system_prompt

def one_shot_mode(self, user_input: str) -> str:
chat_history = [{"role": "system", "content": self.system_prompt}]
chat_history.append({"role": "user", "content": user_input})
full_prompt = self.format_chat(chat_history)

with AgentPhi4Mini.suppress_native_stderr():
output_raw = self.llm(prompt=full_prompt, max_tokens=100, temperature=0.2, stream=False)
output_raw = self.llm(
prompt=full_prompt,
max_tokens=100,
temperature=0.2,
stream=False,
top_p=0.1, # More focused responses
repeat_penalty=1.1 # Reduce repetition
)

# unfortunately llama_cpp has a union type for the output
output = cast(CreateCompletionResponse, output_raw)

assistant_reply : str = output["choices"][0]["text"].strip()
return assistant_reply


assistant_reply: str = output["choices"][0]["text"].strip()
return assistant_reply

def format_chat(self, messages: List[dict[str, str]]) -> str:
chat_prompt = ""
for msg in messages:
124 changes: 124 additions & 0 deletions src/open_codex/agents/qwen_25_coder.py
@@ -0,0 +1,124 @@
import time
import os
import multiprocessing
from typing import cast, Optional, List
from llama_cpp import CreateCompletionResponse, Llama
from open_codex.interfaces.llm_agent import LLMAgent
import contextlib
from huggingface_hub import hf_hub_download, login

class AgentQwen25Coder(LLMAgent):
def download_model(self, model_filename: str,
repo_id: str,
local_dir: str,
token: Optional[str] = None) -> str:
print(
"\nπŸ€– Welcome to Open Codex!\n"
"πŸ“¦ First run requires downloading the model.\n"
"⚑️ This model is optimized for quick responses.\n"
)

start = time.time()
model_path:str = hf_hub_download(
repo_id=repo_id,
filename=model_filename,
local_dir=local_dir,
token=token,
force_download=True, # Force download to ensure the latest version
)
duration = time.time() - start
print(f"βœ… Model downloaded ({duration:.1f}s)")
return model_path

def __init__(self, system_prompt: str, hf_token: Optional[str] = None):
model_filename = "Qwen2.5-Coder-1.5B-Instruct-F16.gguf" # Using correct model filename
repo_id = "unsloth/Qwen2.5-Coder-1.5B-Instruct-GGUF" # Using TheBloke's repository
local_dir = os.path.expanduser("~/.cache/open-codex")
model_path = os.path.join(local_dir, model_filename)

if not hf_token:
hf_token = os.environ.get("HUGGINGFACE_TOKEN")

if not os.path.exists(model_path):
model_path = self.download_model(model_filename, repo_id, local_dir, token=hf_token)
else:
print("πŸš€ Loading Qwen model...\n")

# Get optimal thread count for the system
n_threads = min(4, multiprocessing.cpu_count())

with AgentQwen25Coder.suppress_native_stderr():
self.llm: Llama = Llama(
model_path=model_path,
n_ctx=2048, # Smaller context for faster responses
n_threads=n_threads, # Use optimal thread count
n_batch=256, # Balanced batch size
use_mlock=True, # Lock memory to prevent swapping
use_mmap=True, # Use memory mapping for faster loading
)
print("✨ Model ready!")

self.system_prompt = system_prompt

def one_shot_mode(self, user_input: str) -> str:
chat_history = [
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": "I need a shell command to find all python files"},
{"role": "assistant", "content": "find . -name \"*.py\""},
{"role": "user", "content": user_input}
]
full_prompt = self.format_chat(chat_history)

with AgentQwen25Coder.suppress_native_stderr():
try:
output_raw = self.llm(
prompt=full_prompt,
max_tokens=100, # Limit response length
temperature=0.1, # Lower temperature for more deterministic output
top_p=0.1, # Focus on most likely tokens
top_k=10, # Limit vocabulary for shell commands
repeat_penalty=1.1, # Prevent repetition
stop=["<|im_end|>", "<|im_start|>", "\n"], # Stop at appropriate tokens
stream=False
)

output = cast(CreateCompletionResponse, output_raw)
assistant_reply: str = output["choices"][0]["text"].strip()

# Clean up response
assistant_reply = assistant_reply.split('\n')[0].strip()
assistant_reply = assistant_reply.replace("<|im_end|>", "").strip()

# Basic validation: reject output that still contains chat-template residue
if any(marker in assistant_reply for marker in ("<|", "|>")):
return "find . -name \"*.py\"" # fallback to safe command

return assistant_reply

except Exception as e:
print(f"⚠️ Model error: {str(e)}")
return ""

def format_chat(self, messages: List[dict[str, str]]) -> str:
# Qwen2.5-Instruct models use the ChatML template, which matches the
# <|im_start|>/<|im_end|> stop tokens passed to the model above.
chat_prompt = ""
for msg in messages:
chat_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
chat_prompt += "<|im_start|>assistant\n"
return chat_prompt

@staticmethod
@contextlib.contextmanager
def suppress_native_stderr():
"""
Redirect C-level stderr (fd 2) into /dev/null, so llama.cpp logs vanish.
"""
devnull_fd = os.open(os.devnull, os.O_WRONLY)
saved_stderr_fd = os.dup(2)
try:
os.dup2(devnull_fd, 2)
yield
finally:
os.dup2(saved_stderr_fd, 2)
os.close(devnull_fd)
os.close(saved_stderr_fd)
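For reference, a minimal, non-authoritative snippet showing the new agent driven directly from Python rather than through the CLI. The prompt-loading line mirrors `AgentBuilder.get_agent` above, and a valid `HUGGINGFACE_TOKEN` is assumed to be exported; everything else comes from the class added in this file.

```python
import os
from importlib.resources import files

from open_codex.agents.qwen_25_coder import AgentQwen25Coder

# Load the same system prompt that AgentBuilder uses (see agent_builder.py above).
system_prompt = files("open_codex.resources").joinpath("prompt.txt").read_text(encoding="utf-8")

# The token falls back to the environment, matching the logic inside __init__.
agent = AgentQwen25Coder(system_prompt=system_prompt,
                         hf_token=os.environ.get("HUGGINGFACE_TOKEN"))
print(agent.one_shot_mode("list all python files modified today"))
```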