
Commit

WIP transformer support
remichu-ai committed Sep 29, 2024
1 parent 82e1708 commit fd621c2
Showing 10 changed files with 686 additions and 25 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
@@ -63,7 +63,10 @@ RUN pip install flash-attn==2.6.3 --no-build-isolation
#RUN cd exllamav2 && pip install -r requirements.txt && pip install .
RUN pip install -r requirements.txt
RUN pip install https://github.com/turboderp/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl

#RUN pip install git+https://github.com/huggingface/transformers
RUN pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
#git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
#pip install -vvv --no-build-isolation -e .
RUN pip install gallama

# Clean up
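Note: the Dockerfile change pins transformers to a specific upstream commit alongside the prebuilt exllamav2 wheel and flash-attn. A minimal, hypothetical startup check (not part of this commit) could report which of those optional backend packages are actually importable inside the container:

# Hypothetical helper, not part of the commit: report which optional backend
# packages installed by the Dockerfile are importable in the running container.
import importlib

OPTIONAL_BACKENDS = ["transformers", "exllamav2", "flash_attn"]

def report_backends() -> None:
    for name in OPTIONAL_BACKENDS:
        try:
            module = importlib.import_module(name)
            version = getattr(module, "__version__", "unknown version")
            print(f"{name}: {version}")
        except ImportError:
            print(f"{name}: not installed (backend disabled)")

if __name__ == "__main__":
    report_backends()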
3 changes: 0 additions & 3 deletions src/gallama/api_response/chat_response.py
@@ -257,9 +257,6 @@ async def chat_completion_response(
eos = False
while not eos:
try:
# if await request.is_disconnected():
# logger.info("Request disconnected, stopping queue processing")
# break

result = gen_queue.get_nowait()
if isinstance(result, GenText) and result.text_type=="text":
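Note: the three removed lines were a commented-out client-disconnect check inside the streaming loop. For context, that loop drains gen_queue without blocking until an end-of-stream marker arrives. A minimal self-contained sketch of the pattern follows; GenText and gen_queue appear in the diff, while the GenEnd sentinel and the sleep interval are assumptions:

# Sketch of the non-blocking queue-drain loop surrounding the deleted lines.
# GenText and gen_queue come from the diff; GenEnd and the sleep interval are assumed.
import asyncio
from dataclasses import dataclass

@dataclass
class GenText:
    text: str
    text_type: str = "text"

class GenEnd:
    """Assumed end-of-stream sentinel placed on the queue by the producer."""

async def drain(gen_queue: asyncio.Queue) -> str:
    answer = ""
    eos = False
    while not eos:
        try:
            result = gen_queue.get_nowait()
            if isinstance(result, GenText) and result.text_type == "text":
                answer += result.text
            elif isinstance(result, GenEnd):
                eos = True
        except asyncio.QueueEmpty:
            await asyncio.sleep(0.01)  # yield so the producer task can run
    return answer

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    for chunk in ("Hello", ", ", "world"):
        q.put_nowait(GenText(chunk))
    q.put_nowait(GenEnd())
    print(await drain(q))

asyncio.run(main())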
9 changes: 8 additions & 1 deletion src/gallama/app.py
@@ -52,6 +52,11 @@
# optional dependency
ExLlamaV2Cache_TP = None

# optional dependency
try:
    from gallama.backend.chatgenerator import ChatGeneratorTransformers
except ImportError:
    ChatGeneratorTransformers = None


# Add this after your imports to clear logging from 3rd party module

@@ -150,7 +155,8 @@ async def chat_completion(request: Request, query: ChatMLQuery):
logger.info(f"thinking is used with returnThinking set to {query.return_thinking}")

# start the generation task
asyncio.create_task(llm.chat(
asyncio.create_task(
llm.chat(
query=query,
prompt_eng=prompt_eng,
gen_queue=gen_queue,
@@ -279,6 +285,7 @@ def load_model(model_spec: ModelParser):
chat_generator_dict = {
"exllama": ChatGenerator,
"llama_cpp": ChatGeneratorLlamaCpp,
"transformers": ChatGeneratorTransformers,
}

chatGenerator_to_use = chat_generator_dict[llm_base.backend]
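Note: taken together, the app.py changes follow the same optional-dependency pattern already used for ExLlamaV2Cache_TP: import the transformers generator if it is available, register None otherwise, and look the class up by backend name when a model is loaded. A self-contained sketch of that dispatch is below; the class and dict names mirror the diff, while the placeholder classes and the missing-backend error are assumptions:

# Sketch of the optional-import + backend-dispatch pattern from app.py.
# Names mirror the diff; placeholder classes stand in for the real generators
# so the snippet runs without gallama installed.
class ChatGenerator: ...            # exllama backend (placeholder)
class ChatGeneratorLlamaCpp: ...    # llama.cpp backend (placeholder)

try:
    from gallama.backend.chatgenerator import ChatGeneratorTransformers
except ImportError:
    ChatGeneratorTransformers = None  # optional backend not installed

chat_generator_dict = {
    "exllama": ChatGenerator,
    "llama_cpp": ChatGeneratorLlamaCpp,
    "transformers": ChatGeneratorTransformers,
}

def pick_generator(backend: str):
    """Return the generator class for a backend, failing clearly if unavailable."""
    cls = chat_generator_dict.get(backend)
    if cls is None:
        raise ValueError(f"Backend '{backend}' is unknown or its dependencies are missing.")
    return cls

print(pick_generator("exllama"))  # -> <class '__main__.ChatGenerator'>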
(Diffs for the remaining 7 changed files are not shown.)
