Merge branch 'main' into vision-support

n3d1117 · n3d1117 · Dec 10, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 9, 2023
commit 28ec9f9eefba8ecb8698ebe1bf1d016f8057e9fa
diff --git a/.env.example b/.env.example
@@ -40,9 +40,15 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
 # TEMPERATURE=1.0
 # PRESENCE_PENALTY=0.0
 # FREQUENCY_PENALTY=0.0
-# IMAGE_SIZE=512x512
+# IMAGE_MODEL=dall-e-3
+# IMAGE_QUALITY=hd
+# IMAGE_STYLE=natural
+# IMAGE_SIZE=1024x1024
+# IMAGE_FORMAT=document
 # VISION_DETAIL="low"
 # GROUP_TRIGGER_KEYWORD=""
 # IGNORE_GROUP_TRANSCRIPTIONS=true
 # IGNORE_GROUP_VISION=true
+# TTS_MODEL="tts-1"
+# TTS_VOICE="alloy"
 # BOT_LANGUAGE=en
diff --git a/README.md b/README.md
diff --git a/bot/main.py b/bot/main.py
@@ -56,6 +56,8 @@ def main():
         'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
         'vision_detail': os.environ.get('VISION_DETAIL', 'low'),
         'vision_max_tokens': int(os.environ.get('VISION_MAX_TOKENS', '300')),
+        'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
+        'tts_voice': os.environ.get('TTS_VOICE', 'alloy'),
     }
 
     if openai_config['enable_functions'] and not functions_available:
@@ -77,6 +79,7 @@ def main():
         'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
         'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
         'enable_vision': os.environ.get('ENABLE_VISION', 'true').lower() == 'true',
+        'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true',
         'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
         'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
         'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
@@ -90,6 +93,9 @@ def main():
         'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
         'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
         'vision_token_price': float(os.environ.get('VISION_TOKEN_PRICE', '0.01')),
+        'image_receive_mode': os.environ.get('IMAGE_FORMAT', "photo"),
+        'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
+        'tts_prices': [float(i) for i in os.environ.get('TTS_PRICES', "0.015,0.030").split(",")],
         'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
         'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
     }

diff --git a/bot/openai_helper.py b/bot/openai_helper.py
@@ -26,7 +26,8 @@
 GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613")
 GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
 GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
-GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS
+GPT_4_128K_MODELS = ("gpt-4-1106-preview",)
+GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS
 
 
 def default_max_tokens(model: str) -> int:
@@ -48,6 +49,8 @@ def default_max_tokens(model: str) -> int:
         return base * 8
     elif model in GPT_4_VISION_MODELS:
         return 4096
+    elif model in GPT_4_128K_MODELS:
+        return 4096
 
 
 def are_functions_available(model: str) -> bool:
@@ -400,7 +403,7 @@ async def interpret_image(self, chat_id, fileobj, prompt=None):
             message = {'role':'user', 'content':[{'type':'text', 'text':prompt}, {'type':'image_url', \
                         'image_url': {'url':f'data:image/jpeg;base64,{image}', 'detail':self.config['vision_detail'] } }]}
             common_args = {
-                'model': self.config['model'],
+                'model': 'gpt-4-vision-preview', # the only one that currently makes sense here
                 'messages': self.conversations[chat_id] + [message],
                 'temperature': self.config['temperature'],
                 'n': 1, # several choices is not implemented yet
@@ -493,6 +496,8 @@ def __max_model_tokens(self):
             return base * 8
         if self.config['model'] in GPT_4_VISION_MODELS:
             return base * 31
+        if self.config['model'] in GPT_4_128K_MODELS:
+            return base * 31
         raise NotImplementedError(
             f"Max tokens for model {self.config['model']} is not implemented yet."
         )
@@ -513,7 +518,7 @@ def __count_tokens(self, messages) -> int:
         if model in GPT_3_MODELS + GPT_3_16K_MODELS:
             tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
             tokens_per_name = -1  # if there's a name, the role is omitted
-        elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS:
+        elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS:
             tokens_per_message = 3
             tokens_per_name = 1
         else:
@@ -535,9 +540,9 @@ def __count_tokens_vision(self, fileobj) -> int:
         :return: the number of tokens required
         """
         image = Image.open(fileobj)
-        model = self.config['model']
+        model = 'gpt-4-vision-preview' # fixed for now
         if model not in GPT_4_VISION_MODELS:
-            raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}.""")
+            raise NotImplementedError(f"""count_tokens_vision() is not implemented for model {model}.""")
 
         w, h = image.size
         if w > h: w, h = h, w

diff --git a/bot/telegram_bot.py b/bot/telegram_bot.py
@@ -100,6 +100,7 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         (transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
          transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
         vision_today, vision_month = self.usage[user_id].get_current_vision_tokens()
+        characters_today, characters_month = self.usage[user_id].get_current_tts_usage()
         current_cost = self.usage[user_id].get_current_cost()
 
         chat_id = update.effective_chat.id
@@ -122,12 +123,17 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         text_today_vision = ""
         if self.config.get('enable_vision', False):
             text_today_vision = f"{vision_today} {localized_text('stats_vision', bot_language)}\n"
+
+        text_today_tts = ""
+        if self.config.get('enable_tts_generation', False):
+            text_today_tts = f"{characters_today} {localized_text('stats_tts', bot_language)}\n"
 
         text_today = (
             f"*{localized_text('usage_today', bot_language)}:*\n"
             f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
             f"{text_today_images}"  # Include the image statistics for today if applicable
             f"{text_today_vision}"
+            f"{text_today_tts}"
             f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
             f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
             f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
@@ -141,13 +147,18 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
         text_month_vision = ""
         if self.config.get('enable_vision', False):
             text_month_vision = f"{vision_month} {localized_text('stats_vision', bot_language)}\n"
+
+        text_month_tts = ""
+        if self.config.get('enable_tts_generation', False):
+            text_month_tts = f"{characters_month} {localized_text('stats_tts', bot_language)}\n"
 
         # Check if image generation is enabled and, if so, generate the image statistics for the month
         text_month = (
             f"*{localized_text('usage_month', bot_language)}:*\n"
             f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
             f"{text_month_images}"  # Include the image statistics for the month if applicable
             f"{text_month_vision}"
+            f"{text_month_tts}"
             f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
             f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
             f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"

diff --git a/bot/usage_tracker.py b/bot/usage_tracker.py
@@ -56,16 +56,18 @@ def __init__(self, user_id, user_name, logs_dir="usage_logs"):
         if os.path.isfile(self.user_file):
             with open(self.user_file, "r") as file:
                 self.usage = json.load(file)
-                if 'vision_tokens' not in self.usage['usage_history']:
-                    self.usage['usage_history']['vision_tokens'] = {}
+            if 'vision_tokens' not in self.usage['usage_history']:
+                self.usage['usage_history']['vision_tokens'] = {}
+            if 'tts_characters' not in self.usage['usage_history']:
+                self.usage['usage_history']['tts_characters'] = {}
         else:
             # ensure directory exists
             pathlib.Path(logs_dir).mkdir(exist_ok=True)
             # create new dictionary for this user
             self.usage = {
                 "user_name": user_name,
                 "current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
-                "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "vision_tokens":{}}
+                "usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "tts_characters": {}, "vision_tokens":{}}
             }
 
     # token usage functions:
@@ -194,6 +196,57 @@ def get_current_vision_tokens(self):
                 tokens_month += tokens
         return tokens_day, tokens_month
 
+    # tts usage functions:
+
+    def add_tts_request(self, text_length, tts_model, tts_prices):
+        """Record one text-to-speech request and add its cost to the current costs.
+
+        :param text_length: number of characters that were converted to speech
+        :param tts_model: model used for the request, one of ['tts-1', 'tts-1-hd']
+        :param tts_prices: prices per 1K characters, in the order ['tts-1', 'tts-1-hd']
+        :raises ValueError: if tts_model is not one of the supported models
+        """
+        tts_models = ['tts-1', 'tts-1-hd']
+        if tts_model not in tts_models:
+            # list.index() would raise a ValueError as well; raise it explicitly so the message names the model
+            raise ValueError(f"Unsupported TTS model '{tts_model}', expected one of {tts_models}")
+        price = tts_prices[tts_models.index(tts_model)]
+        today = str(date.today())
+        # NOTE(review): rounding to 2 decimals makes short requests cost 0.0
+        # (e.g. 100 characters at 0.015 per 1K) - confirm this is intended
+        tts_price = round(text_length * price / 1000, 2)
+        self.add_current_costs(tts_price)
+
+        # update usage_history, creating the tts and per-model entries on first use
+        tts_history = self.usage["usage_history"].setdefault("tts_characters", {})
+        model_history = tts_history.setdefault(tts_model, {})
+        # add requested text length to today's entry (starting from 0 for a new date)
+        model_history[today] = model_history.get(today, 0) + text_length
+
+        # write updated usage to user file
+        with open(self.user_file, "w") as outfile:
+            json.dump(self.usage, outfile)
+
+    def get_current_tts_usage(self):
+        """Get length of speech generated for today and this month.
+
+        Sums the characters recorded for the models 'tts-1' and 'tts-1-hd'.
+
+        :return: total amount of characters converted to speech per day and per month
+        """
+        tts_models = ['tts-1', 'tts-1-hd']
+        today = str(date.today())
+        month = today[:7]  # year-month as string
+        # tolerate usage data that has no tts history yet
+        tts_history = self.usage["usage_history"].get("tts_characters", {})
+
+        characters_day = 0
+        characters_month = 0
+        for tts_model in tts_models:
+            model_history = tts_history.get(tts_model, {})
+            characters_day += model_history.get(today, 0)
+            # use a separate loop name so the current date is not overwritten
+            for day, characters in model_history.items():
+                if day.startswith(month):
+                    characters_month += characters
+        return int(characters_day), int(characters_month)
+
+
     # transcription usage functions:
 
     def add_transcription_seconds(self, seconds, minute_price=0.006):
@@ -279,14 +332,15 @@ def get_current_cost(self):
         cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
         return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}
 
-    def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01):
+    def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01, tts_prices='0.015,0.030'):
         """Get total USD amount of all requests in history
 
         :param tokens_price: price per 1000 tokens, defaults to 0.002
         :param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
             defaults to [0.016, 0.018, 0.02]
         :param minute_price: price per minute transcription, defaults to 0.006
-        :param vision_token_price: price per 1k vision token interpretation, defaults to 0.01
+        :param vision_token_price: price per 1K vision token interpretation, defaults to 0.01
+        :param tts_prices: price per 1K characters tts per model ['tts-1', 'tts-1-hd'], defaults to [0.015, 0.030]
         :return: total cost of all requests
         """
         total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
@@ -302,5 +356,9 @@ def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018
         total_vision_tokens = sum(self.usage['usage_history']['vision_tokens'].values())
         vision_cost = round(total_vision_tokens * vision_token_price / 1000, 2)
 
-        all_time_cost = token_cost + transcription_cost + image_cost + vision_cost
+        total_characters = [sum(tts_model.values()) for tts_model in self.usage['usage_history']['tts_characters'].values()]
+        tts_prices_list = [float(x) for x in tts_prices.split(',')]
+        tts_cost = round(sum([count * price / 1000 for count, price in zip(total_characters, tts_prices_list)]), 2)
+
+        all_time_cost = token_cost + transcription_cost + image_cost + vision_cost + tts_cost
         return all_time_cost
diff --git a/requirements.txt b/requirements.txt
@@ -11,4 +11,4 @@ spotipy~=2.23.0
 pytube~=15.0.0
 gtts~=2.3.2
 whois~=0.9.27
-Pillow~=10.1.0
+Pillow~=10.1.0
diff --git a/translations.json b/translations.json
@@ -16,6 +16,7 @@
         "stats_tokens":"tokens",
         "stats_images":"images generated",
         "stats_vision":"image tokens interpreted",
+        "stats_tts":"characters converted to speech",
         "stats_transcribe":["minutes and", "seconds transcribed"],
         "stats_total":"💰 For a total amount of $",
         "stats_budget":"Your remaining budget",
@@ -28,6 +29,8 @@
         "image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
         "image_fail":"Failed to generate image",
         "vision_fail":"Failed to interpret image",
+        "tts_no_prompt":"Please provide text! (e.g. /tts my house)",
+        "tts_fail":"Failed to generate speech",
         "media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
         "media_type_fail":"Unsupported file type",
         "transcript":"Transcript",