Files
alloy-voice-assistant/assistant.py

172 lines
4.8 KiB
Python
Raw Normal View History

2024-05-27 15:57:16 -04:00
import base64
from threading import Lock, Thread
import cv2
2024-05-31 15:21:33 -04:00
import openai
2024-05-27 15:57:16 -04:00
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
2024-10-21 08:14:29 -04:00
from langchain_openai import ChatOpenAI
2024-05-27 15:57:16 -04:00
from pyaudio import PyAudio, paInt16
from speech_recognition import Microphone, Recognizer, UnknownValueError
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI credentials come from;
# nothing in this file sets them explicitly — confirm.
load_dotenv()
class WebcamStream:
    """Keep the most recent webcam frame available to any thread.

    A background thread (see :meth:`update`) keeps reading frames from
    capture device 0 and stores the latest one in ``self.frame``. Access to
    that attribute is guarded by ``self.lock`` so :meth:`read` can be called
    from other threads.

    The object can be used as a context manager: entering starts the capture
    thread, leaving stops it and releases the capture device.
    """

    def __init__(self):
        self.stream = VideoCapture(index=0)
        # Grab one frame up front so read() has something to return before
        # the background thread has delivered its first frame.
        _, self.frame = self.stream.read()
        self.running = False
        # Set by start(); kept as None so stop() is safe before start().
        self.thread = None
        self.lock = Lock()

    def start(self):
        """Start the capture thread (no-op if already running); return self."""
        if self.running:
            return self

        self.running = True
        self.thread = Thread(target=self.update, args=())
        self.thread.start()
        return self

    def update(self):
        """Thread body: refresh ``self.frame`` until ``self.running`` is cleared."""
        while self.running:
            ok, frame = self.stream.read()
            # A failed grab yields no frame; keep the previous good frame
            # instead of replacing it with None (which would break read()).
            if not ok or frame is None:
                continue
            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        Args:
            encode: When true, return the frame JPEG-encoded and then
                base64-encoded (as ``bytes``) instead of the raw frame.

        Raises:
            RuntimeError: If no frame has been captured yet.
        """
        with self.lock:
            if self.frame is None:
                raise RuntimeError("No frame has been captured from the webcam.")
            frame = self.frame.copy()

        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)

        return frame

    def stop(self):
        """Ask the capture thread to finish and wait for it to exit."""
        self.running = False
        if self.thread is not None and self.thread.is_alive():
            self.thread.join()

    def __enter__(self):
        return self.start()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Stop the reader first so the device is not released while the
        # background thread may still be reading from it.
        self.stop()
        self.stream.release()
class Assistant:
    """Answer spoken prompts about a webcam image and speak the reply aloud.

    The given chat model is wrapped in a LangChain chain that receives the
    text prompt, a base64-encoded JPEG and the running chat history. The
    textual response is then synthesised with OpenAI text-to-speech and
    played through PyAudio.
    """

    def __init__(self, model):
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Send *prompt* and *image* to the model and speak the response.

        Args:
            prompt: Transcribed user question. Empty or whitespace-only
                prompts are ignored.
            image: Base64-encoded JPEG as ``bytes`` (decoded to ``str``
                before being placed in the request).
        """
        # Nothing intelligible was said; do not spend a model call on it.
        if not prompt or not prompt.strip():
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image.decode()},
            # The history factory ignores the session id (a single shared
            # history is used), but the key is passed with every call.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        """Stream *response* as speech to the default audio output.

        Audio is requested as raw PCM and written to a mono, 16-bit,
        24 kHz output stream chunk by chunk as it arrives.
        """
        audio = PyAudio()
        try:
            player = audio.open(format=paInt16, channels=1, rate=24000, output=True)
            try:
                with openai.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="alloy",
                    response_format="pcm",
                    input=response,
                ) as stream:
                    for chunk in stream.iter_bytes(chunk_size=1024):
                        player.write(chunk)
            finally:
                # Close the output stream even if synthesis or playback fails.
                player.stop_stream()
                player.close()
        finally:
            # A fresh PyAudio instance is created per call, so release it too.
            audio.terminate()

    def _create_inference_chain(self, model):
        """Build the prompt -> model -> string chain with chat history attached."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions. Your job is to answer
        questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis.

        Be friendly and helpful. Show some personality.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        # One in-memory history shared by every call, whatever the session id.
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
# Open the webcam and start grabbing frames in the background right away.
webcam_stream = WebcamStream().start()

# The assistant is driven by OpenAI's GPT-4o. To try Gemini Flash instead,
# import ChatGoogleGenerativeAI (package langchain_google_genai, which this
# file does not import) and replace the model line with:
#   model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
model = ChatOpenAI(model="gpt-4o")

assistant = Assistant(model)
def audio_callback(recognizer, audio):
    """Transcribe a captured phrase and let the assistant answer it.

    Passed to ``listen_in_background`` below as its callback, which calls it
    from the background listener. The phrase is transcribed with Whisper and
    answered using the current webcam frame (base64-encoded JPEG).

    Args:
        recognizer: The ``Recognizer`` that captured the audio.
        audio: The captured audio data to transcribe.
    """
    try:
        prompt = recognizer.recognize_whisper(audio, model="base", language="english")
    except UnknownValueError:
        print("There was an error processing the audio.")
        return

    try:
        assistant.answer(prompt, webcam_stream.read(encode=True))
    except Exception as error:
        # This runs on the listener thread; an uncaught exception here would
        # end that thread and silently stop all further listening, so report
        # the failure and keep going.
        print("There was an error answering the prompt:", error)
recognizer = Recognizer()
microphone = Microphone()

# Calibrate the recognizer against ambient noise before listening starts.
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)

# Returns a function that stops the background listener when called.
stop_listening = recognizer.listen_in_background(microphone, audio_callback)

try:
    # Show the live webcam preview until Esc (key code 27) or "q" is pressed.
    while True:
        cv2.imshow("webcam", webcam_stream.read())
        if cv2.waitKey(1) in (27, ord("q")):
            break
finally:
    # Always clean up, including on Ctrl+C or a display error.
    stop_listening(wait_for_stop=False)
    webcam_stream.stop()
    # stop() only ends the reader thread; release the capture device too.
    webcam_stream.stream.release()
    cv2.destroyAllWindows()