Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vision models support (WIP) #457

Merged
merged 12 commits into from
Dec 10, 2023
Prev Previous commit
Next Next commit
Merge branch 'main' into vision-support
  • Loading branch information
gilcu3 committed Nov 18, 2023
commit 28ec9f9eefba8ecb8698ebe1bf1d016f8057e9fa
8 changes: 7 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,15 @@ ALLOWED_TELEGRAM_USER_IDS=USER_ID_1,USER_ID_2
# TEMPERATURE=1.0
# PRESENCE_PENALTY=0.0
# FREQUENCY_PENALTY=0.0
# IMAGE_SIZE=512x512
# IMAGE_MODEL=dall-e-3
# IMAGE_QUALITY=hd
# IMAGE_STYLE=natural
# IMAGE_SIZE=1024x1024
# IMAGE_FORMAT=document
# VISION_DETAIL="low"
# GROUP_TRIGGER_KEYWORD=""
# IGNORE_GROUP_TRANSCRIPTIONS=true
# IGNORE_GROUP_VISION=true
# TTS_MODEL="tts-1"
# TTS_VOICE="alloy"
# BOT_LANGUAGE=en
56 changes: 32 additions & 24 deletions README.md

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions bot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def main():
'vision_prompt': os.environ.get('VISION_PROMPT', 'What is in this image'),
'vision_detail': os.environ.get('VISION_DETAIL', 'low'),
'vision_max_tokens': int(os.environ.get('VISION_MAX_TOKENS', '300')),
'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
'tts_voice': os.environ.get('TTS_VOICE', 'alloy'),
}

if openai_config['enable_functions'] and not functions_available:
Expand All @@ -77,6 +79,7 @@ def main():
'enable_image_generation': os.environ.get('ENABLE_IMAGE_GENERATION', 'true').lower() == 'true',
'enable_transcription': os.environ.get('ENABLE_TRANSCRIPTION', 'true').lower() == 'true',
'enable_vision': os.environ.get('ENABLE_VISION', 'true').lower() == 'true',
'enable_tts_generation': os.environ.get('ENABLE_TTS_GENERATION', 'true').lower() == 'true',
'budget_period': os.environ.get('BUDGET_PERIOD', 'monthly').lower(),
'user_budgets': os.environ.get('USER_BUDGETS', os.environ.get('MONTHLY_USER_BUDGETS', '*')),
'guest_budget': float(os.environ.get('GUEST_BUDGET', os.environ.get('MONTHLY_GUEST_BUDGET', '100.0'))),
Expand All @@ -90,6 +93,9 @@ def main():
'token_price': float(os.environ.get('TOKEN_PRICE', 0.002)),
'image_prices': [float(i) for i in os.environ.get('IMAGE_PRICES', "0.016,0.018,0.02").split(",")],
'vision_token_price': float(os.environ.get('VISION_TOKEN_PRICE', '0.01')),
'image_receive_mode': os.environ.get('IMAGE_FORMAT', "photo"),
'tts_model': os.environ.get('TTS_MODEL', 'tts-1'),
'tts_prices': [float(i) for i in os.environ.get('TTS_PRICES', "0.015,0.030").split(",")],
'transcription_price': float(os.environ.get('TRANSCRIPTION_PRICE', 0.006)),
'bot_language': os.environ.get('BOT_LANGUAGE', 'en'),
}
Expand Down
15 changes: 10 additions & 5 deletions bot/openai_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
GPT_4_MODELS = ("gpt-4", "gpt-4-0314", "gpt-4-0613")
GPT_4_32K_MODELS = ("gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-0613")
GPT_4_VISION_MODELS = ("gpt-4-vision-preview",)
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS
GPT_4_128K_MODELS = ("gpt-4-1106-preview",)
GPT_ALL_MODELS = GPT_3_MODELS + GPT_3_16K_MODELS + GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS


def default_max_tokens(model: str) -> int:
Expand All @@ -48,6 +49,8 @@ def default_max_tokens(model: str) -> int:
return base * 8
elif model in GPT_4_VISION_MODELS:
return 4096
elif model in GPT_4_128K_MODELS:
return 4096


def are_functions_available(model: str) -> bool:
Expand Down Expand Up @@ -400,7 +403,7 @@ async def interpret_image(self, chat_id, fileobj, prompt=None):
message = {'role':'user', 'content':[{'type':'text', 'text':prompt}, {'type':'image_url', \
'image_url': {'url':f'data:image/jpeg;base64,{image}', 'detail':self.config['vision_detail'] } }]}
common_args = {
'model': self.config['model'],
'model': 'gpt-4-vision-preview', # the only one that currently makes sense here
'messages': self.conversations[chat_id] + [message],
'temperature': self.config['temperature'],
'n': 1, # several choices is not implemented yet
Expand Down Expand Up @@ -493,6 +496,8 @@ def __max_model_tokens(self):
return base * 8
if self.config['model'] in GPT_4_VISION_MODELS:
return base * 31
if self.config['model'] in GPT_4_128K_MODELS:
return base * 31
raise NotImplementedError(
f"Max tokens for model {self.config['model']} is not implemented yet."
)
Expand All @@ -513,7 +518,7 @@ def __count_tokens(self, messages) -> int:
if model in GPT_3_MODELS + GPT_3_16K_MODELS:
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
tokens_per_name = -1 # if there's a name, the role is omitted
elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS:
elif model in GPT_4_MODELS + GPT_4_32K_MODELS + GPT_4_VISION_MODELS + GPT_4_128K_MODELS:
tokens_per_message = 3
tokens_per_name = 1
else:
Expand All @@ -535,9 +540,9 @@ def __count_tokens_vision(self, fileobj) -> int:
:return: the number of tokens required
"""
image = Image.open(fileobj)
model = self.config['model']
model = 'gpt-4-vision-preview' # fixed for now
if model not in GPT_4_VISION_MODELS:
raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}.""")
raise NotImplementedError(f"""count_tokens_vision() is not implemented for model {model}.""")

w, h = image.size
if w > h: w, h = h, w
Expand Down
11 changes: 11 additions & 0 deletions bot/telegram_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
(transcribe_minutes_today, transcribe_seconds_today, transcribe_minutes_month,
transcribe_seconds_month) = self.usage[user_id].get_current_transcription_duration()
vision_today, vision_month = self.usage[user_id].get_current_vision_tokens()
characters_today, characters_month = self.usage[user_id].get_current_tts_usage()
current_cost = self.usage[user_id].get_current_cost()

chat_id = update.effective_chat.id
Expand All @@ -122,12 +123,17 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
text_today_vision = ""
if self.config.get('enable_vision', False):
text_today_vision = f"{vision_today} {localized_text('stats_vision', bot_language)}\n"

text_today_tts = ""
if self.config.get('enable_tts_generation', False):
text_today_tts = f"{characters_today} {localized_text('stats_tts', bot_language)}\n"

text_today = (
f"*{localized_text('usage_today', bot_language)}:*\n"
f"{tokens_today} {localized_text('stats_tokens', bot_language)}\n"
f"{text_today_images}" # Include the image statistics for today if applicable
f"{text_today_vision}"
f"{text_today_tts}"
f"{transcribe_minutes_today} {localized_text('stats_transcribe', bot_language)[0]} "
f"{transcribe_seconds_today} {localized_text('stats_transcribe', bot_language)[1]}\n"
f"{localized_text('stats_total', bot_language)}{current_cost['cost_today']:.2f}\n"
Expand All @@ -141,13 +147,18 @@ async def stats(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
text_month_vision = ""
if self.config.get('enable_vision', False):
text_month_vision = f"{vision_month} {localized_text('stats_vision', bot_language)}\n"

text_month_tts = ""
if self.config.get('enable_tts_generation', False):
text_month_tts = f"{characters_month} {localized_text('stats_tts', bot_language)}\n"

# Check if image generation is enabled and, if so, generate the image statistics for the month
text_month = (
f"*{localized_text('usage_month', bot_language)}:*\n"
f"{tokens_month} {localized_text('stats_tokens', bot_language)}\n"
f"{text_month_images}" # Include the image statistics for the month if applicable
f"{text_month_vision}"
f"{text_month_tts}"
f"{transcribe_minutes_month} {localized_text('stats_transcribe', bot_language)[0]} "
f"{transcribe_seconds_month} {localized_text('stats_transcribe', bot_language)[1]}\n"
f"{localized_text('stats_total', bot_language)}{current_cost['cost_month']:.2f}"
Expand Down
70 changes: 64 additions & 6 deletions bot/usage_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,18 @@ def __init__(self, user_id, user_name, logs_dir="usage_logs"):
if os.path.isfile(self.user_file):
with open(self.user_file, "r") as file:
self.usage = json.load(file)
if 'vision_tokens' not in self.usage['usage_history']:
self.usage['usage_history']['vision_tokens'] = {}
if 'vision_tokens' not in self.usage['usage_history']:
self.usage['usage_history']['vision_tokens'] = {}
if 'tts_characters' not in self.usage['usage_history']:
self.usage['usage_history']['tts_characters'] = {}
else:
# ensure directory exists
pathlib.Path(logs_dir).mkdir(exist_ok=True)
# create new dictionary for this user
self.usage = {
"user_name": user_name,
"current_cost": {"day": 0.0, "month": 0.0, "all_time": 0.0, "last_update": str(date.today())},
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "vision_tokens":{}}
"usage_history": {"chat_tokens": {}, "transcription_seconds": {}, "number_images": {}, "tts_characters": {}, "vision_tokens":{}}
}

# token usage functions:
Expand Down Expand Up @@ -194,6 +196,57 @@ def get_current_vision_tokens(self):
tokens_month += tokens
return tokens_day, tokens_month

# tts usage functions:

def add_tts_request(self, text_length, tts_model, tts_prices):
    """Record a text-to-speech request in the usage history and add its cost.

    The character count is accumulated per model and per day under
    ``usage_history['tts_characters']``, the cost is passed to
    ``add_current_costs`` and the whole usage dict is written to the user file.

    :param text_length: number of characters converted to speech
    :param tts_model: TTS model used, one of 'tts-1' or 'tts-1-hd'
    :param tts_prices: prices per 1000 characters, in the order ['tts-1', 'tts-1-hd']
    :raises ValueError: if ``tts_model`` is not a known TTS model
    """
    tts_models = ['tts-1', 'tts-1-hd']
    # Validate before touching any state so a bad model name fails with a
    # readable message instead of list.index()'s "x is not in list".
    if tts_model not in tts_models:
        raise ValueError(f"Unknown TTS model {tts_model!r}; expected one of {tts_models}")
    price = tts_prices[tts_models.index(tts_model)]

    # NOTE(review): rounding each request to cents means short texts are
    # booked as 0.0 — kept as-is to preserve existing accounting; confirm
    # whether a finer precision is wanted.
    tts_price = round(text_length * price / 1000, 2)
    self.add_current_costs(tts_price)

    # The 'tts_characters' section and the per-model dict are created on
    # first use.
    today = str(date.today())
    per_model = (
        self.usage['usage_history']
        .setdefault('tts_characters', {})
        .setdefault(tts_model, {})
    )
    # add requested text length to today's entry (created on first use)
    per_model[today] = per_model.get(today, 0) + text_length

    # write updated TTS usage to user file
    with open(self.user_file, "w") as outfile:
        json.dump(self.usage, outfile)

def get_current_tts_usage(self):
    """Get the number of characters converted to speech today and this month.

    Sums the per-day character counts stored under
    ``usage_history['tts_characters']`` for the models 'tts-1' and 'tts-1-hd'.

    :return: tuple ``(characters_today, characters_this_month)`` as ints
    """
    tts_models = ('tts-1', 'tts-1-hd')
    today = str(date.today())
    month = today[:7]  # year-month as string

    # The 'tts_characters' section may be missing from the usage dict;
    # treat that as "no usage" rather than raising KeyError.
    history = self.usage["usage_history"].get("tts_characters", {})

    characters_day = 0
    characters_month = 0
    for tts_model in tts_models:
        per_day = history.get(tts_model, {})
        characters_day += per_day.get(today, 0)
        # keys are date strings, so a prefix match selects the current month
        characters_month += sum(
            characters
            for day, characters in per_day.items()
            if day.startswith(month)
        )
    return int(characters_day), int(characters_month)


# transcription usage functions:

def add_transcription_seconds(self, seconds, minute_price=0.006):
Expand Down Expand Up @@ -279,14 +332,15 @@ def get_current_cost(self):
cost_all_time = self.usage["current_cost"].get("all_time", self.initialize_all_time_cost())
return {"cost_today": cost_day, "cost_month": cost_month, "cost_all_time": cost_all_time}

def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01):
def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018,0.02", minute_price=0.006, vision_token_price=0.01, tts_prices='0.015,0.030'):
"""Get total USD amount of all requests in history

:param tokens_price: price per 1000 tokens, defaults to 0.002
:param image_prices: prices for images of sizes ["256x256", "512x512", "1024x1024"],
defaults to [0.016, 0.018, 0.02]
:param minute_price: price per minute transcription, defaults to 0.006
:param vision_token_price: price per 1k vision token interpretation, defaults to 0.01
:param vision_token_price: price per 1K vision token interpretation, defaults to 0.01
:param tts_prices: price per 1K characters tts per model ['tts-1', 'tts-1-hd'], defaults to [0.015, 0.030]
:return: total cost of all requests
"""
total_tokens = sum(self.usage['usage_history']['chat_tokens'].values())
Expand All @@ -302,5 +356,9 @@ def initialize_all_time_cost(self, tokens_price=0.002, image_prices="0.016,0.018
total_vision_tokens = sum(self.usage['usage_history']['vision_tokens'].values())
vision_cost = round(total_vision_tokens * vision_token_price / 1000, 2)

all_time_cost = token_cost + transcription_cost + image_cost + vision_cost
total_characters = [sum(tts_model.values()) for tts_model in self.usage['usage_history']['tts_characters'].values()]
tts_prices_list = [float(x) for x in tts_prices.split(',')]
tts_cost = round(sum([count * price / 1000 for count, price in zip(total_characters, tts_prices_list)]), 2)

all_time_cost = token_cost + transcription_cost + image_cost + vision_cost + tts_cost
return all_time_cost
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ spotipy~=2.23.0
pytube~=15.0.0
gtts~=2.3.2
whois~=0.9.27
Pillow~=10.1.0
Pillow~=10.1.0
3 changes: 3 additions & 0 deletions translations.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"stats_tokens":"tokens",
"stats_images":"images generated",
"stats_vision":"image tokens interpreted",
"stats_tts":"characters converted to speech",
"stats_transcribe":["minutes and", "seconds transcribed"],
"stats_total":"💰 For a total amount of $",
"stats_budget":"Your remaining budget",
Expand All @@ -28,6 +29,8 @@
"image_no_prompt":"Please provide a prompt! (e.g. /image cat)",
"image_fail":"Failed to generate image",
"vision_fail":"Failed to interpret image",
"tts_no_prompt":"Please provide text! (e.g. /tts my house)",
"tts_fail":"Failed to generate speech",
"media_download_fail":["Failed to download audio file", "Make sure the file is not too large. (max 20MB)"],
"media_type_fail":"Unsupported file type",
"transcript":"Transcript",
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.