2024-05-27 15:57:16 -04:00
|
|
|
import base64
|
|
|
|
|
from threading import Lock, Thread
|
|
|
|
|
|
|
|
|
|
import cv2
|
2024-05-31 15:21:33 -04:00
|
|
|
import openai
|
2024-05-27 15:57:16 -04:00
|
|
|
from cv2 import VideoCapture, imencode
|
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
|
|
|
from langchain.schema.messages import SystemMessage
|
|
|
|
|
from langchain_community.chat_message_histories import ChatMessageHistory
|
|
|
|
|
from langchain_core.output_parsers import StrOutputParser
|
|
|
|
|
from langchain_core.runnables.history import RunnableWithMessageHistory
|
2024-10-21 08:14:29 -04:00
|
|
|
from langchain_openai import ChatOpenAI
|
2024-05-27 15:57:16 -04:00
|
|
|
from pyaudio import PyAudio, paInt16
|
|
|
|
|
from speech_recognition import Microphone, Recognizer, UnknownValueError
|
|
|
|
|
|
|
|
|
|
# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WebcamStream:
    """Continuously capture frames from the default webcam on a background thread.

    The most recent frame is kept in ``self.frame`` behind a lock so readers
    always see a consistent snapshot. Use ``start()`` to begin capturing and
    ``stop()`` to shut the thread down; the object also works as a context
    manager, releasing the capture device on exit.
    """

    def __init__(self):
        # Device index 0 is the system default camera.
        self.stream = VideoCapture(index=0)
        # Grab one frame synchronously so read() never sees an empty buffer.
        _, self.frame = self.stream.read()
        self.running = False
        self.lock = Lock()
        # Created in start(); initialized here so stop() is safe even if
        # start() was never called (original raised AttributeError).
        self.thread = None

    def start(self):
        """Start the background capture thread. Returns self for chaining."""
        if self.running:
            return self

        self.running = True

        self.thread = Thread(target=self.update, args=())
        self.thread.start()
        return self

    def update(self):
        """Capture loop run on the background thread; keeps the latest frame."""
        while self.running:
            _, frame = self.stream.read()

            # `with` guarantees the lock is released even if assignment fails.
            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        When *encode* is True, return the frame JPEG-encoded as base64 bytes
        (suitable for a data-URL); otherwise return the raw frame array.
        """
        with self.lock:
            frame = self.frame.copy()

        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)

        return frame

    def stop(self):
        """Stop the capture loop and join the background thread."""
        self.running = False
        # Guard: stop() may be called before start() ever ran.
        if self.thread is not None and self.thread.is_alive():
            self.thread.join()

    def __enter__(self):
        # Pair for __exit__ below; the original defined __exit__ only, so
        # `with WebcamStream() as ...` would fail with AttributeError.
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.stream.release()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Assistant:
    """Multimodal chat assistant: answers a spoken prompt about a webcam image,
    keeps chat history across turns, and speaks responses aloud via OpenAI TTS.
    """

    def __init__(self, model):
        # model: a LangChain chat model that accepts image_url content parts
        # (e.g. ChatOpenAI with a vision-capable model).
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Run one inference turn and speak the response.

        prompt: transcribed user text; empty/None prompts are ignored.
        image: base64-encoded JPEG bytes of the current webcam frame.
        """
        if not prompt:
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image.decode()},
            # Single shared session: history persists for the process lifetime.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        """Stream OpenAI text-to-speech audio for *response* to the speakers."""
        # 24 kHz mono 16-bit PCM matches the "pcm" response_format below.
        audio = PyAudio()
        player = audio.open(format=paInt16, channels=1, rate=24000, output=True)

        try:
            with openai.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="alloy",
                response_format="pcm",
                input=response,
            ) as stream:
                for chunk in stream.iter_bytes(chunk_size=1024):
                    player.write(chunk)
        finally:
            # Release the audio device even if streaming fails mid-way; the
            # original leaked the stream and the PyAudio instance every call.
            player.stop_stream()
            player.close()
            audio.terminate()

    def _create_inference_chain(self, model):
        """Build the prompt | model | parser chain wrapped with message history."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions. Your job is to answer
        questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis.

        Be friendly and helpful. Show some personality.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            # The frame is inlined as a base64 data URL.
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        # One in-memory history shared by every session id (see answer()).
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Start grabbing frames right away so a current frame is ready for the
# first spoken prompt.
webcam_stream = WebcamStream().start()

# Alternative vision model (needs langchain_google_genai and a Google API key):
# model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")

# Vision-capable chat model used for inference; requires OPENAI_API_KEY
# (loaded from .env above).
model = ChatOpenAI(model="gpt-4o")

assistant = Assistant(model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def audio_callback(recognizer, audio):
    """Background-listener callback: transcribe captured speech with Whisper
    and answer it using the current webcam frame.
    """
    try:
        spoken_text = recognizer.recognize_whisper(audio, model="base", language="english")
        frame_b64 = webcam_stream.read(encode=True)
        assistant.answer(spoken_text, frame_b64)

    except UnknownValueError:
        print("There was an error processing the audio.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
recognizer = Recognizer()

microphone = Microphone()

with microphone as source:
    # Calibrate the energy threshold to the current background noise level
    # before listening, so speech detection is reliable.
    recognizer.adjust_for_ambient_noise(source)

# Non-blocking: spawns a listener thread that invokes audio_callback per
# utterance; returns a callable that stops the listener.
stop_listening = recognizer.listen_in_background(microphone, audio_callback)

# Show the live feed until ESC (27) or "q" is pressed.
while True:
    cv2.imshow("webcam", webcam_stream.read())

    if cv2.waitKey(1) in [27, ord("q")]:
        break

# Teardown: stop frame capture, close the window, stop the listener thread
# without waiting for it to finish its current utterance.
webcam_stream.stop()

cv2.destroyAllWindows()

stop_listening(wait_for_stop=False)
|