
Commit

WIP transformer support
remichu-ai committed Sep 29, 2024
1 parent 82e1708 commit fd621c2
Showing 10 changed files with 686 additions and 25 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
@@ -63,7 +63,10 @@ RUN pip install flash-attn==2.6.3 --no-build-isolation
#RUN cd exllamav2 && pip install -r requirements.txt && pip install .
RUN pip install -r requirements.txt
RUN pip install https://github.com/turboderp/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.3.1-cp311-cp311-linux_x86_64.whl

#RUN pip install git+https://github.com/huggingface/transformers
RUN pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
#git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
#pip install -vvv --no-build-isolation -e .
RUN pip install gallama

# Clean up
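Note: the Dockerfile change pins transformers to a specific upstream commit alongside the prebuilt exllamav2 wheel and flash-attn. A minimal, hypothetical startup check (not part of this commit) could report which of those optional backend packages are actually importable inside the container:

# Hypothetical helper, not part of the commit: report which optional backend
# packages installed by the Dockerfile are importable in the running container.
import importlib

OPTIONAL_BACKENDS = ["transformers", "exllamav2", "flash_attn"]

def report_backends() -> None:
    for name in OPTIONAL_BACKENDS:
        try:
            module = importlib.import_module(name)
            version = getattr(module, "__version__", "unknown version")
            print(f"{name}: {version}")
        except ImportError:
            print(f"{name}: not installed (backend disabled)")

if __name__ == "__main__":
    report_backends()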
3 changes: 0 additions & 3 deletions src/gallama/api_response/chat_response.py
@@ -257,9 +257,6 @@ async def chat_completion_response(
eos = False
while not eos:
try:
# if await request.is_disconnected():
# logger.info("Request disconnected, stopping queue processing")
# break

result = gen_queue.get_nowait()
if isinstance(result, GenText) and result.text_type=="text":
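Note: the three removed lines were a commented-out client-disconnect check inside the streaming loop. For context, that loop drains gen_queue without blocking until an end-of-stream marker arrives. A minimal self-contained sketch of the pattern follows; GenText and gen_queue appear in the diff, while the GenEnd sentinel and the sleep interval are assumptions:

# Sketch of the non-blocking queue-drain loop surrounding the deleted lines.
# GenText and gen_queue come from the diff; GenEnd and the sleep interval are assumed.
import asyncio
from dataclasses import dataclass

@dataclass
class GenText:
    text: str
    text_type: str = "text"

class GenEnd:
    """Assumed end-of-stream sentinel placed on the queue by the producer."""

async def drain(gen_queue: asyncio.Queue) -> str:
    answer = ""
    eos = False
    while not eos:
        try:
            result = gen_queue.get_nowait()
            if isinstance(result, GenText) and result.text_type == "text":
                answer += result.text
            elif isinstance(result, GenEnd):
                eos = True
        except asyncio.QueueEmpty:
            await asyncio.sleep(0.01)  # yield so the producer task can run
    return answer

async def main() -> None:
    q: asyncio.Queue = asyncio.Queue()
    for chunk in ("Hello", ", ", "world"):
        q.put_nowait(GenText(chunk))
    q.put_nowait(GenEnd())
    print(await drain(q))

asyncio.run(main())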
9 changes: 8 additions & 1 deletion src/gallama/app.py
@@ -52,6 +52,11 @@
# optional dependency
ExLlamaV2Cache_TP = None

# optional dependency
try:
    from gallama.backend.chatgenerator import ChatGeneratorTransformers
except ImportError:
    ChatGeneratorTransformers = None


# Add this after your imports to clear logging from 3rd party module

@@ -150,7 +155,8 @@ async def chat_completion(request: Request, query: ChatMLQuery):
logger.info(f"thinking is used with returnThinking set to {query.return_thinking}")

# start the generation task
asyncio.create_task(llm.chat(
asyncio.create_task(
llm.chat(
query=query,
prompt_eng=prompt_eng,
gen_queue=gen_queue,
@@ -279,6 +285,7 @@ def load_model(model_spec: ModelParser):
chat_generator_dict = {
"exllama": ChatGenerator,
"llama_cpp": ChatGeneratorLlamaCpp,
"transformers": ChatGeneratorTransformers,
}

chatGenerator_to_use = chat_generator_dict[llm_base.backend]
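Note: taken together, the app.py changes follow the same optional-dependency pattern already used for ExLlamaV2Cache_TP: import the transformers generator if it is available, register None otherwise, and look the class up by backend name when a model is loaded. A self-contained sketch of that dispatch is below; the class and dict names mirror the diff, while the placeholder classes and the missing-backend error are assumptions:

# Sketch of the optional-import + backend-dispatch pattern from app.py.
# Names mirror the diff; placeholder classes stand in for the real generators
# so the snippet runs without gallama installed.
class ChatGenerator: ...            # exllama backend (placeholder)
class ChatGeneratorLlamaCpp: ...    # llama.cpp backend (placeholder)

try:
    from gallama.backend.chatgenerator import ChatGeneratorTransformers
except ImportError:
    ChatGeneratorTransformers = None  # optional backend not installed

chat_generator_dict = {
    "exllama": ChatGenerator,
    "llama_cpp": ChatGeneratorLlamaCpp,
    "transformers": ChatGeneratorTransformers,
}

def pick_generator(backend: str):
    """Return the generator class for a backend, failing clearly if unavailable."""
    cls = chat_generator_dict.get(backend)
    if cls is None:
        raise ValueError(f"Backend '{backend}' is unknown or its dependencies are missing.")
    return cls

print(pick_generator("exllama"))  # -> <class '__main__.ChatGenerator'>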
(Diffs for the remaining 7 changed files are not shown.)
