commit 85c7434fca73a8bedd5eb1aa9269a908418740b0 Author: Santiago L. Valdarrama Date: Fri May 24 11:54:36 2024 -0400 ... diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0c13355 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +/.venv \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d78ea64 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ + + +Create a virtual environment and update pip: + +``` +$ python3 -m venv .venv +$ source .venv/bin/activate +$ pip install -U pip +``` \ No newline at end of file diff --git a/omni.py b/omni.py new file mode 100644 index 0000000..acca6fd --- /dev/null +++ b/omni.py @@ -0,0 +1,131 @@ +import base64 + +import openai +import pyaudio +import speech_recognition as sr +import whisper +from cv2 import VideoCapture, destroyWindow, imencode, imshow, waitKey +from dotenv import load_dotenv +from langchain.memory import ChatMessageHistory +from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain.schema.messages import SystemMessage +from langchain_core.output_parsers import StrOutputParser +from langchain_core.runnables.history import RunnableWithMessageHistory +from langchain_openai.chat_models import ChatOpenAI + +load_dotenv() + +CAM_PORT = 0 + +current_frame = None + +whisper_model = whisper.load_model("small") +model = ChatOpenAI(model="gpt-4o") + +SYSTEM_PROMPT = """ +You are a witty assistant that will use the image provided by the user +to answer its questions. + +Use few words on your questions. Go straight to the point. Do not use any +emoticons or emojis. Do not ask follow-up questions. + +You have memory, so you can use the entire context of the conversation to +tailor your answers. You can also use the image provided by the user. +""" + +prompt_template = ChatPromptTemplate.from_messages( + [ + SystemMessage(content=SYSTEM_PROMPT), + MessagesPlaceholder(variable_name="chat_history"), + ( + "human", + [ + {"type": "text", "text": "{question}"}, + { + "type": "image_url", + "image_url": "data:image/jpeg;base64,{image_base64}", + }, + ], + ), + ] +) + +parser = StrOutputParser() +chain = prompt_template | model | parser + +chat_message_history = ChatMessageHistory() + +chain_with_history = RunnableWithMessageHistory( + chain, + lambda session_id: chat_message_history, + input_messages_key="question", + history_messages_key="chat_history", +) + + +def inference(text, image): + print("Sending request to the model...") + + response = chain_with_history.invoke( + {"question": text, "image_base64": image.decode()}, + config={"configurable": {"session_id": "unused"}}, + ) + + return response + + +def tts(text): + player_stream = pyaudio.PyAudio().open( + format=pyaudio.paInt16, channels=1, rate=24000, output=True + ) + + with openai.audio.speech.with_streaming_response.create( + model="tts-1", + voice="alloy", + response_format="pcm", + input=text, + ) as response: + for chunk in response.iter_bytes(chunk_size=1024): + player_stream.write(chunk) + + +def audio_callback(recognizer, audio): + try: + prompt = recognizer.recognize_whisper(audio, language="english") + + if prompt: + print("Prompt:", prompt) + + _, buffer = imencode(".jpeg", current_frame) + encoded_image = base64.b64encode(buffer) + + result = inference(prompt, encoded_image) + print(result) + + tts(result) + + except sr.UnknownValueError: + print("We couldn't understand audio") + + +r = sr.Recognizer() +m = sr.Microphone() +with m as source: + r.adjust_for_ambient_noise(source) + +stop_listening = r.listen_in_background(m, audio_callback) + +# calling this function requests that the background listener stop listening +# stop_listening(wait_for_stop=False) + +while True: + cam = VideoCapture(CAM_PORT) + result, current_frame = cam.read() + + if result: + imshow("webcam", current_frame) + + if waitKey(1) & 0xFF == ord("q"): + break + +destroyWindow("webcam") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..af9c48c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +opencv-python +langchain +langchain-openai +langchain-community +python-dotenv +pyaudio +SpeechRecognition +git+https://github.com/openai/whisper.git \ No newline at end of file