GPT-4をPythonで使う（再戦）　その３：ビジョン判定をする

実際のところ

前提条件

公式サンプルで引用されているWikipediaの画像を、スクリプトを実行するディレクトリにsample.jpgとして保存しておく（引数を省略した場合は ./sample.jpg が読み込まれる）。

スクリプト

from openai import OpenAI
from dotenv import dotenv_values
import sys
import base64

def encode_image(image_path):
  """Return the contents of the file at ``image_path`` as base64 text.

  The file is opened in binary mode and read in full; the base64 bytes
  are decoded as UTF-8 so the result is a ``str``.
  """
  with open(image_path, "rb") as source:
    raw_bytes = source.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode("utf-8")

# Question the script asks about the image when run from the command line.
DEFAULT_PROMPT = "How many person in this image?"


def detectImage(prompt, client, image_b64=None, model="gpt-4-vision-preview", max_tokens=300):
  """Send a text prompt plus one base64-encoded image to the chat completions API.

  Args:
    prompt: Question or instruction text sent alongside the image.
    client: Object exposing ``chat.completions.create(...)``; the script
      below passes an ``OpenAI`` client.
    image_b64: Base64 text of the image. When omitted, a module-level
      ``base64_image`` is used if one exists, so older two-argument calls
      keep working.
    model: Model name forwarded to the API.
      NOTE(review): this preview model name may have been retired by
      OpenAI -- confirm against the current model list.
    max_tokens: Completion token limit forwarded to the API.

  Returns:
    Whatever ``client.chat.completions.create`` returns, unmodified.

  Raises:
    ValueError: If no image is passed and no module-level ``base64_image``
      is available.
  """
  if image_b64 is None:
    # Backward compatibility: earlier versions read this module global.
    image_b64 = globals().get("base64_image")
  if image_b64 is None:
    raise ValueError("no image supplied: pass image_b64 to detectImage()")

  # The data URL always declares image/jpeg, regardless of the file type.
  image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
  }
  text_part = {"type": "text", "text": prompt}

  response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": [text_part, image_part]}],
    max_tokens=max_tokens,
  )
  return response


# Guarded so that importing this module performs no file or network I/O.
if __name__ == "__main__":
  # Path to your image: first command-line argument, else the local sample.
  mydata = sys.argv[1] if len(sys.argv) > 1 else "./sample.jpg"

  # Getting the base64 string
  base64_image = encode_image(mydata)

  config = dotenv_values(".env")
  api_key = config.get("OPENAI_API_KEY")
  if not api_key:
    # Fail with a readable message instead of a bare KeyError traceback.
    sys.exit("OPENAI_API_KEY is not set in .env")

  yourclient = OpenAI(api_key=api_key)

  # Pass the question as the prompt; the image goes in image_b64.
  result = detectImage(DEFAULT_PROMPT, yourclient, image_b64=base64_image)
  comments = result.choices[0].message.content
  print(comments)

試しに実行すると

$ python3 visionTest.py
There are no people visible in this image. It features a wooden path leading through a grassy area under a blue sky with some clouds.