
Experimenting with Llama-2 support
KillianLucas committed Aug 6, 2023
1 parent 21abd56 commit 7fa1fd9
Showing 8 changed files with 157 additions and 43 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -153,7 +153,7 @@ Since generated code is executed in your local environment, it can interact with

**⚠️ Open Interpreter will ask for user confirmation before executing code.**

You can run `interpreter -y` or set `interpreter.no_confirm = True` to bypass this confirmation, in which case:
You can run `interpreter -y` or set `interpreter.auto_run = True` to bypass this confirmation, in which case:

- Be cautious when requesting commands that modify files or system settings.
- Watch Open Interpreter like a self-driving car, and be prepared to end the process by closing your terminal.
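For context, a minimal sketch of what the renamed setting looks like from Python (the prompt string is illustrative; `auto_run` replaces the old `no_confirm` attribute, matching the updated tests later in this commit):

```python
# Sketch: bypass the confirmation prompt programmatically.
import interpreter

interpreter.auto_run = True  # equivalent to launching with `interpreter -y`
interpreter.chat("Print the numbers 1 through 5.")  # illustrative prompt
```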
Binary file added diskcache-5.6.1-py3-none-any.whl
Binary file not shown.
7 changes: 7 additions & 0 deletions interpreter/cli.py
@@ -20,6 +20,10 @@ def cli(interpreter):
'--yes',
action='store_true',
help='execute code without user confirmation')
parser.add_argument('-l',
'--local',
action='store_true',
help='run fully local with llama-2')
args = parser.parse_args()

if args.yes:
@@ -28,5 +32,8 @@ def cli(interpreter):
# Print message with newlines on either side (aesthetic choice)
print('', Markdown(confirm_mode_message), '')

if args.local:
interpreter.local = True

# Now run the chat method
interpreter.chat()
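A brief usage sketch for the new flag (the `interpreter` console command is assumed from the README; the `local` attribute is what `-l` / `--local` sets):

```python
# Sketch: run fully local with Llama-2 instead of GPT-4.
# CLI equivalent (assumed entry point): `interpreter --local` or `interpreter -l`
import interpreter

interpreter.local = True  # what the --local flag toggles on the instance
interpreter.chat()        # interactive chat, now backed by the local Llama-2 model
```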
140 changes: 101 additions & 39 deletions interpreter/interpreter.py
@@ -13,7 +13,7 @@
from rich import print
from rich.markdown import Markdown

# Function schema for function-calling GPTs
# Function schema for GPT-4
function_schema = {
"name": "run_code",
"description":
@@ -54,6 +54,7 @@ def __init__(self):
self.temperature = 0.01
self.api_key = None
self.auto_run = False
self.local = False

# Get default system message
# here = os.path.abspath(os.path.dirname(__file__))
@@ -68,6 +69,11 @@ def __init__(self):
# (blocks are visual representation of messages on the terminal)
self.active_block = None

# Note: While Open Interpreter can use Llama, we will prioritize GPT-4.
# GPT-4 is faster, smarter, can call functions, and is all-around easier to use.
    # This makes GPT-4 better aligned with Open Interpreter's priority of being easy to use.
self.llama_instance = None

def cli(self):
# The cli takes the current instance of Interpreter,
# modifies it according to command line flags, then runs chat.
@@ -86,14 +92,21 @@ def get_info_for_system_message(self):
operating_system = os.name if os.name != 'nt' else os.uname().sysname
info += f"\n\n[User Info]\nName: {username}\nCWD: {current_working_directory}\nOS: {operating_system}"

# Open Procedures is an open-source database of tiny, structured coding tutorials.
# We can query it semantically and append relevant tutorials to our system message:

# Get a procedure that's relevant to the last message
query = str(self.messages[-1])
url = f"https://open-procedures.replit.app/search/?query={query}"
relevant_procedure = requests.get(url).json()["procedure"]
info += "\n\n[Related Recommended Procedure] (might be irrelevant)\n" + relevant_procedure
if not self.local:

# Open Procedures is an open-source database of tiny, structured coding tutorials.
# We can query it semantically and append relevant tutorials to our system message:

# Get a procedure that's relevant to the last message
query = str(self.messages[-1])
url = f"https://open-procedures.replit.app/search/?query={query}"
relevant_procedure = requests.get(url).json()["procedure"]
info += "\n\n[Related Recommended Procedure]\n" + relevant_procedure

elif self.local:

# Tell Llama-2 how to run code.
      info += "\n\nTo run Python code, simply write a Python code block (i.e. ```python) in markdown. When you close it with ```, it will be run. You'll then be given its output."

return info

@@ -105,7 +118,17 @@ def load(self, messages):
self.messages = messages

def chat(self, message=None, return_messages=False):
self.verify_api_key()

# Connect to an LLM
if not self.local:
# GPT-4
self.verify_api_key()
elif self.local:
# Llama-2
if self.llama_instance == None:
        # Find or install Llama-2
from .llama_2 import llama_2
self.llama_instance = llama_2

# Message won't be None if we're passing one in via interpreter.chat(message)
    # In that case, we respond non-interactively and return:
@@ -163,29 +186,38 @@ def end_active_block(self):
self.active_block = None

def respond(self):

# Add relevant info to system_message
# (e.g. current working directory, username, os, etc.)
info = self.get_info_for_system_message()
system_message = self.system_message + "\n\n" + info

print("system_message:\n\n", system_message)

# Make OpenAI call
model = "gpt-4-0613"
response = openai.ChatCompletion.create(
model=model,
messages=tt.trim(self.messages,
model,
system_message=system_message),
functions=[function_schema],
stream=True,
temperature=self.temperature,
)

# Initialize
# Make LLM call
if not self.local:
# GPT-4
model = "gpt-4-0613"
response = openai.ChatCompletion.create(
model=model,
messages=tt.trim(self.messages,
model,
system_message=system_message),
functions=[function_schema],
stream=True,
temperature=self.temperature,
)
elif self.local:
# Llama-2
response = self.llama_instance.create_chat_completion(
messages=tt.trim(self.messages,
"gpt-3.5-turbo",
system_message=system_message),
stream=True,
temperature=self.temperature,
)

# Initialize message, function call trackers, and active block
self.messages.append({})
in_function_call = False
llama_function_call_finished = False
self.active_block = None

for chunk in response:
@@ -196,7 +228,15 @@ def respond(self):
self.messages[-1] = merge_deltas(self.messages[-1], delta)

# Check if we're in a function call
if "function_call" in self.messages[-1]:
if not self.local:
condition = "function_call" in self.messages[-1]
elif self.local:
# Since Llama-2 can't call functions, we just check if we're in a code block.
# This simply returns true if the number of "```" in the message is odd.
condition = self.messages[-1]["content"].count("```") % 2 == 1

if condition:
# We are in a function call.

# Check if we just entered a function call
if in_function_call == False:
@@ -216,18 +256,40 @@
# Remember we're in a function_call
in_function_call = True

# Parse arguments and save to parsed_arguments, under function_call
if "arguments" in self.messages[-1]["function_call"]:
arguments = self.messages[-1]["function_call"]["arguments"]
new_parsed_arguments = parse_partial_json(arguments)

if new_parsed_arguments:
# Only overwrite what we have if it's not None (which means it failed to parse)
self.messages[-1]["function_call"]["parsed_arguments"] = new_parsed_arguments
# Now let's parse the function's arguments:

if not self.local:
# GPT-4
# Parse arguments and save to parsed_arguments, under function_call
if "arguments" in self.messages[-1]["function_call"]:
arguments = self.messages[-1]["function_call"]["arguments"]
new_parsed_arguments = parse_partial_json(arguments)
if new_parsed_arguments:
# Only overwrite what we have if it's not None (which means it failed to parse)
self.messages[-1]["function_call"]["parsed_arguments"] = new_parsed_arguments

elif self.local:
# Llama-2
# Get contents of current code block and save to parsed_arguments, under function_call
current_code_block = self.messages[-1]["content"].split("```")[-1]
arguments = {"language": "python", "code": current_code_block}
self.messages[-1]["function_call"]["parsed_arguments"] = arguments

else:
# We are not in a function call.

# Check if we just left a function call
if in_function_call == True:

if self.local:
# This is the same as when GPT-4 gives finish_reason as function_call.
# We have just finished a code block, so now we should run it.
llama_function_call_finished = True

# Remember we're not in a function_call
in_function_call = False

# If we're not in a function call and there's no active block,
# If there's no active block,
if self.active_block == None:

# Create a message block
@@ -237,8 +299,8 @@ def respond(self):
self.active_block.update_from_message(self.messages[-1])

# Check if we're finished
if chunk.choices[0].finish_reason:
if chunk.choices[0].finish_reason == "function_call":
if chunk.choices[0].finish_reason or llama_function_call_finished:
if chunk.choices[0].finish_reason == "function_call" or llama_function_call_finished:
# Time to call the function!
# (Because this is Open Interpreter, we only have one function.)

@@ -300,4 +362,4 @@ def respond(self):
if chunk.choices[0].finish_reason != "function_call":
# Done!
self.active_block.end()
return
return
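Because Llama-2 has no function-calling API, the changes above treat an odd number of triple-backtick fences as meaning a code block is still open, and take everything after the last fence as the code to run. A standalone sketch of that heuristic (helper names are illustrative, not part of the commit):

```python
# Sketch of the code-block heuristic used for Llama-2 in respond() above.
def in_open_code_block(content: str) -> bool:
    # An odd number of "```" fences means a block was opened but not yet closed.
    return content.count("```") % 2 == 1

def current_code_block(content: str) -> str:
    # Everything after the last fence is the (partial) code the model is writing.
    return content.split("```")[-1]

message = "Sure, let's compute it:\n```\nprint(27073 * 7397)\n"
assert in_open_code_block(message)                                   # still inside the block
assert current_code_block(message).strip() == "print(27073 * 7397)"
```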
41 changes: 41 additions & 0 deletions interpreter/llama_2.py
@@ -0,0 +1,41 @@
import os
import time
import subprocess

# Define the file name to search for
file_name = "llama-2-13b-chat.ggmlv3.q4_0.bin"

# Start the timer
start_time = time.time()

# Check for the file in each path
for path in [os.path.expanduser("~"), os.getcwd()]:
print(f"Searching for Llama-2 in {path} ...")
for root, _, files in os.walk(path):
if time.time() - start_time > 5:
print("Search timed out after 5 seconds.")
break
if file_name in files:
model_path = os.path.join(root, file_name)
print(f"Found Llama-2 at {model_path}")
break
else:
continue
break
else:
# If the file was not found, download it
download_path = os.path.expanduser("~") + "/llama-2/" + file_name
print(f"Llama-2 not found. Downloading it to {download_path} ...")
url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
subprocess.run(f"curl -L '{url}' -o {download_path}", shell=True)
model_path = download_path

try:
from llama_cpp import Llama
except:
print("Downloading Llama-2 interface (llama-cpp-python)...")
subprocess.run(["pip", "install", "llama-cpp-python"])
from llama_cpp import Llama

# Initialize Llama-2
llama_2 = Llama(model_path=model_path)
Binary file not shown.
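The interpreter.py changes stream completions from this `llama_2` instance with `create_chat_completion`. A self-contained sketch of that call with llama-cpp-python (the model path and prompt are illustrative; streamed chunks follow an OpenAI-style delta format):

```python
# Sketch: streaming a chat completion with llama-cpp-python, as respond() does above.
import os
from llama_cpp import Llama

# Illustrative path — llama_2.py searches the home directory and CWD for this file.
model_path = os.path.expanduser("~/llama-2/llama-2-13b-chat.ggmlv3.q4_0.bin")
llama_2 = Llama(model_path=model_path)

response = llama_2.create_chat_completion(
    messages=[{"role": "user", "content": "Write a Python code block that prints 2 + 2."}],
    stream=True,
    temperature=0.01,
)

for chunk in response:                        # each chunk is a dict with a "delta" payload
    delta = chunk["choices"][0]["delta"]
    print(delta.get("content", ""), end="", flush=True)
```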
10 changes: 7 additions & 3 deletions tests/test_interpreter.py
@@ -1,20 +1,24 @@
import interpreter
interpreter.no_confirm = True
interpreter.auto_run = True
interpreter.temperature = 0

def test_delayed_exec():
interpreter.reset()
interpreter.chat("""Can you write a single block of code and run_code it that prints something, then delays 5 seconds, then prints something else? No talk just code. Thanks!""", return_messages=True)

def test_nested_loops_and_multiple_newlines():
interpreter.reset()
interpreter.chat("""Can you write a nested for loop in python and shell and run them? Also put 1-3 newlines between each line in the code. Thanks!""", return_messages=True)

def test_math():
interpreter.reset()
messages = interpreter.chat("""Please perform the calculation 27073*7397 then reply with just the integer answer, nothing else.""", return_messages=True)
messages = interpreter.chat("""Please perform the calculation 27073*7397 then reply with just the integer answer with no commas or anything, nothing else.""", return_messages=True)
assert messages[-1] == {'role': 'assistant', 'content': '200258981'}

def test_hello_world():
interpreter.reset()
messages = interpreter.chat("""Please reply with just the words "Hello, World!" and nothing else.""", return_messages=True)
assert messages == [{'role': 'user', 'content': 'Please reply with just the words "Hello, World!" and nothing else.'}, {'role': 'assistant', 'content': 'Hello, World!'}]
assert messages == [{'role': 'user', 'content': 'Please reply with just the words "Hello, World!" and nothing else. Do not run code.'}, {'role': 'assistant', 'content': 'Hello, World!'}]

def test_markdown():
interpreter.reset()
Binary file added typing_extensions-4.7.1-py3-none-any.whl
Binary file not shown.
