image to text support, closes #134

madox2 · Dec 20, 2024 · 2643c4f · 2643c4f
1 parent 933a90d
commit 2643c4f
Show file tree

Hide file tree

Showing 6 changed files with 345 additions and 107 deletions.
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ To get an idea what is possible to do with AI commands see the [prompts](https:/
 - Edit selected text in-place with AI
 - Interactive conversation with ChatGPT
 - Custom roles
+- Vision capabilities (image to text)
 - Integrates with any OpenAI-compatible API
 
 ## How it works
@@ -194,7 +195,7 @@ You are a Clean Code expert, I have the following code, please refactor it in a
 
 ```
 
-To include files in the chat a special `include` role is used:
+To include files in the chat a special `include` section is used:
 
 ```
 >>> user
@@ -207,9 +208,22 @@ Generate documentation for the following files
 /home/user/myproject/**/*.py
 ```
 
-Each file's contents will be added to an additional `user` role message with the files separated by `==> {path} <==`, where path is the path to the file. Globbing is expanded out via `glob.gob` and relative paths to the current working directory (as determined by `getcwd()`) will be resolved to absolute paths.
+Each file's contents will be added to an additional user message with `==> {path} <==` header, relative paths are resolved to the current working directory.
 
-Supported chat roles are **`>>> system`**, **`>>> user`**, **`>>> include`** and **`<<< assistant`**
+
+To use image vision capabilities (image to text) include an image file:
+
+```
+>>> user
+
+What object is on the image?
+
+>>> include
+
+~/myimage.jpg
+```
+
+Supported chat sections are **`>>> system`**, **`>>> user`**, **`>>> include`** and **`<<< assistant`**
 
 ### `:AIRedo`
 

diff --git a/py/chat.py b/py/chat.py
@@ -56,7 +56,8 @@ def initialize_chat_window():
     messages = initial_messages + chat_messages
 
     try:
-        if messages[-1]["content"].strip():
+        last_content = messages[-1]["content"][-1]
+        if last_content['type'] != 'text' or last_content['text']:
             vim.command("normal! Go\n<<< assistant\n\n")
             vim.command("redraw")
 

diff --git a/py/utils.py b/py/utils.py
@@ -12,6 +12,7 @@
 from urllib.error import HTTPError
 import traceback
 import configparser
+import base64
 
 utils_py_imported = True
 
@@ -109,63 +110,84 @@ def render_text_chunks(chunks):
     if not full_text.strip():
         raise KnownError('Empty response received. Tip: You can try modifying the prompt and retry.')
 
+def encode_image(image_path):
+    """Encodes an image file to a base64 string."""
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
 
-def parse_chat_messages(chat_content):
-    lines = chat_content.splitlines()
-    messages = []
-    for line in lines:
-        if line.startswith(">>> system"):
-            messages.append({"role": "system", "content": ""})
-            continue
-        if line.startswith(">>> user"):
-            messages.append({"role": "user", "content": ""})
-            continue
-        if line.startswith(">>> include"):
-            messages.append({"role": "include", "content": ""})
-            continue
-        if line.startswith("<<< assistant"):
-            messages.append({"role": "assistant", "content": ""})
-            continue
-        if not messages:
-            continue
-        messages[-1]["content"] += "\n" + line
 
-    for message in messages:
-        # strip newlines from the content as it causes empty responses
-        message["content"] = message["content"].strip()
+def is_image_path(path):
+    ext = path.strip().split('.')[-1]
+    return ext in ['jpg', 'jpeg', 'png', 'gif']
 
-        if message["role"] == "include":
-            message["role"] = "user"
-            paths = message["content"].split("\n")
-            message["content"] = ""
+def parse_include_paths(path):
+    if not path:
+        return []
+    pwd = vim.eval('getcwd()')
 
-            pwd = vim.eval("getcwd()")
-            for i in range(len(paths)):
-                path = os.path.expanduser(paths[i])
-                if not os.path.isabs(path):
-                    path = os.path.join(pwd, path)
+    path = os.path.expanduser(path)
+    if not os.path.isabs(path):
+        path = os.path.join(pwd, path)
 
-                paths[i] = path
+    expanded_paths = [path]
+    if '*' in path:
+        expanded_paths = glob.glob(path, recursive=True)
 
-                if '**' in path:
-                    paths[i] = None
-                    paths.extend(glob.glob(path, recursive=True))
+    return [path for path in expanded_paths if not os.path.isdir(path)]
 
-            for path in paths:
-                if path is None:
-                    continue
+def make_image_message(path):
+    ext = path.split('.')[-1]
+    base64_image = encode_image(path)
+    return { 'type': 'image_url', 'image_url': { 'url': f"data:image/{ext.replace('.', '')};base64,{base64_image}" } }
+
+def make_text_file_message(path):
+    try:
+        with open(path, 'r') as file:
+            file_content = file.read().strip()
+            return { 'type': 'text', 'text': f'==> {path} <==\n' + file_content.strip() }
+    except UnicodeDecodeError:
+        return { 'type': 'text', 'text': f'==> {path} <==\nBinary file, cannot display' }
+
+def parse_chat_messages(chat_content):
+    lines = chat_content.splitlines()
+    messages = []
 
-                if os.path.isdir(path):
+    current_type = ''
+    for line in lines:
+        match line:
+            case '>>> system':
+                messages.append({'role': 'system', 'content': [{ 'type': 'text', 'text': '' }]})
+                current_type = 'system'
+            case '<<< assistant':
+                messages.append({'role': 'assistant', 'content': [{ 'type': 'text', 'text': '' }]})
+                current_type = 'assistant'
+            case '>>> user':
+                if messages and messages[-1]['role'] == 'user':
+                    messages[-1]['content'].append({ 'type': 'text', 'text': '' })
+                else:
+                    messages.append({'role': 'user', 'content': [{ 'type': 'text', 'text': '' }]})
+                current_type = 'user'
+            case '>>> include':
+                if not messages or messages[-1]['role'] != 'user':
+                    messages.append({'role': 'user', 'content': []})
+                current_type = 'include'
+            case _:
+                if not messages:
                     continue
+                match current_type:
+                    case 'assistant' | 'system' | 'user':
+                        messages[-1]['content'][-1]['text'] += '\n' + line
+                    case 'include':
+                        paths = parse_include_paths(line)
+                        for path in paths:
+                            content = make_image_message(path) if is_image_path(path) else make_text_file_message(path)
+                            messages[-1]['content'].append(content)
 
-                try:
-                    with open(path, "r") as file:
-                        file_content = file.read().strip()
-                        message["content"] += f"\n\n==> {path} <==\n" + file_content
-                except UnicodeDecodeError:
-                    message["content"] += "\n\n" + f"==> {path} <=="
-                    message["content"] += "\n" + "Binary file, cannot display"
-            message['content'] = message['content'].strip()
+    for message in messages:
+        # strip newlines from the text content as it causes empty responses
+        for content in message['content']:
+            if content['type'] == 'text':
+                content['text'] = content['text'].strip()
 
     return messages