Эх сурвалжийг харах

Refine logging and add image description feature

Enhanced the speech generation logging to display the word count of the input text instead of the full text. This change prioritizes user privacy and improves log readability. Implemented a new feature to generate descriptions for images within a conversation, expanding the bot's capabilities. Also refactored the `BaseTool` class to read its arguments through the `.get` method, so that missing arguments are handled gracefully instead of raising `KeyError`, and to include `messages` by default.
Kumi 1 жил өмнө

+ 36 - 1

@@ -4,6 +4,7 @@ import tiktoken
 import asyncio
 import json
+import base64
 from functools import partial
 from contextlib import closing
@@ -387,7 +388,7 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
             bytes: The audio data.
-        self.logger.log(f"Generating speech from text '{text}'...")
+        self.logger.log(f"Generating speech from text of length: {len(text.split())} words...")
         speech = await self.openai_api.audio.speech.create(
@@ -475,3 +476,37 @@ Only the event_types mentioned above are allowed, you must not respond in any ot
         return images, len(images)
+    async def describe_images(self, messages: list, user: Optional[str] = None) -> Tuple[str, int]:
+        """Generate a description for the images in a conversation.
+
+        Args:
+            messages (list): The conversation messages. The first entry is
+                dropped and replaced by an image-description system prompt
+                (presumably it is the conversation's own system message;
+                confirm against callers).
+            user (Optional[str]): The user identifier forwarded to the API. Defaults to None.
+
+        Returns:
+            Tuple[str, int]: The description and the number of tokens used.
+        """
+        self.logger.log("Generating description for images in conversation...")
+
+        system_message = "You are an image description generator. You generate descriptions for all images in the current conversation, one after another."
+
+        # Build a new list instead of mutating the caller's messages.
+        messages = [
+            {
+                "role": "system",
+                "content": system_message
+            }
+        ] + messages[1:]
+
+        # Keep the configured model only if its name marks it as a vision
+        # model; otherwise fall back to the vision preview model. The
+        # selected model must be the one actually sent with the request.
+        chat_model = self.chat_model
+        if "vision" not in chat_model:
+            chat_model = "gpt-4-vision-preview"
+
+        chat_partial = partial(
+            self.openai_api.chat.completions.create,
+            model=chat_model,
+            messages=messages,
+            user=user,
+        )
+        response = await self._request_with_retries(chat_partial)
+
+        return response.choices[0].message.content, response.usage.total_tokens

+ 4 - 3

@@ -4,9 +4,10 @@ class BaseTool:
     def __init__(self, **kwargs):
         self.kwargs = kwargs
-        self.bot = kwargs["bot"]
-        self.room = kwargs["room"]
-        self.user = kwargs["user"]
+        self.bot = kwargs.get("bot")  # None when "bot" is not supplied (no KeyError)
+        self.room = kwargs.get("room")  # None when "room" is not supplied
+        self.user = kwargs.get("user")  # None when "user" is not supplied
+        self.messages = kwargs.get("messages", [])  # empty list when "messages" is not supplied
     async def run(self):
         raise NotImplementedError()  # subclasses are expected to override run()

+ 5 - 14

@@ -1,24 +1,15 @@
 from .base import BaseTool, Handover
 class Imagedescription(BaseTool):
-    DESCRIPTION = "Describe the content of an image."
+    DESCRIPTION = "Describe the content of the images in the conversation."
         "type": "object",
         "properties": {
-            "image": {
-                "type": "string",
-                "description": "The image to describe.",
-            },
-        "required": ["image"],
     async def run(self):
-        """Describe an image.
-        This tool only hands over to the original model, if applicable.
-        It is intended to handle the case where GPT-3 thinks it is asked to
-        *generate* an image, but the user actually wants to *describe* an
-        image...
-        """
-        raise Handover()
+        """Describe images in the conversation."""
+        image_api = self.bot.image_api
+        return (await image_api.describe_images(self.messages, self.user))[0]