diff --git a/.gitignore b/.gitignore
index b6e4761..3a7344b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,9 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+ffcache*
+env.bat
+*.mp3
+*.wav
+test.py
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..148d41c
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "editor.formatOnSave": true,
+    "python.formatting.provider": "black",
+    "python.formatting.blackArgs": ["--line-length", "120"],
+    "python.linting.pylintUseMinimalCheckers": false,
+    "editor.showUnused": true
+}
diff --git a/convo/actors.py b/convo/actors.py
new file mode 100644
index 0000000..bb315d4
--- /dev/null
+++ b/convo/actors.py
@@ -0,0 +1,55 @@
+"""Conversation participants: each one turns the transcript so far into its next reply."""
+
+from typing import List
+from abc import ABC, abstractmethod
+
+from convo.audio_input import WhisperMicrophone
+from convo.audio_output import TTSSpeaker
+from convo.openai_io import OpenAIChatCompletion
+
+
+class ChatAgent(ABC):
+    """Interface for anything that can take a turn in a conversation."""
+
+    @abstractmethod
+    def get_response(self, transcript: List[str]) -> str:
+        """Return this agent's next message given the transcript so far."""
+
+
+class MicrophoneInSpeakerTTSOut(ChatAgent):
+    """Agent that speaks the latest message aloud and answers with transcribed microphone input."""
+
+    def __init__(self):
+        self.mic = WhisperMicrophone()
+        self.speaker = TTSSpeaker()
+
+    def get_response(self, transcript: List[str]) -> str:
+        """Play the last transcript entry, if any, then return what the microphone transcribes."""
+        if transcript:
+            latest = transcript[-1]
+            self.speaker.play(latest)
+        return self.mic.get_transcription()
+
+
+class TerminalInPrintOut(ChatAgent):
+    """Agent that prints the latest message and reads its answer from standard input."""
+
+    def get_response(self, transcript: List[str]) -> str:
+        """Print the last transcript entry, if any, then return the line typed at the prompt."""
+        if transcript:
+            latest = transcript[-1]
+            print(latest)
+        return input(" response > ")
+
+
+class OpenAIChat(ChatAgent):
+    """Agent whose answers come from the OpenAI chat-completion wrapper."""
+
+    def __init__(self):
+        self.openai_chat = OpenAIChatCompletion()
+
+    def get_response(self, transcript: List[str]) -> str:
+        """Return the model's reply, or the placeholder "Generic" when the transcript is empty."""
+        if not transcript:
+            return "Generic"
+        return self.openai_chat.get_response(transcript)
diff --git a/convo/audio_input.py b/convo/audio_input.py
new file mode 100644
index 0000000..e58f90a
--- /dev/null
+++ b/convo/audio_input.py
@@ -0,0 +1,29 @@
+import io
+import os
+import tempfile
+
+from pydub import AudioSegment
+import speech_recognition as sr
+import whisper
+
+
+class WhisperMicrophone:
+    def __init__(self):
+        self.audio_model = whisper.load_model("tiny")
+        self.recognizer = sr.Recognizer()
+        self.recognizer.energy_threshold = 500  # fixed loudness cut-off separating speech from silence
+        self.recognizer.pause_threshold = 0.8  # seconds of silence that end a phrase
+        self.recognizer.dynamic_energy_threshold = False  # do not auto-adjust the cut-off
+
+    def get_transcription(self) -> str:
+        with sr.Microphone(sample_rate=16000) as source:
+            print("Waiting for mic...")
+            with tempfile.TemporaryDirectory() as tmp:
+                tmp_path = os.path.join(tmp, "mic.wav")
+                audio = self.recognizer.listen(source)  # blocks until one phrase has been captured
+                data = io.BytesIO(audio.get_wav_data())
+                audio_clip = AudioSegment.from_file(data)
+                audio_clip.export(tmp_path, format="wav")
+                result = self.audio_model.transcribe(tmp_path, language="english")
+                predicted_text = result["text"]
+        return predicted_text
diff --git a/convo/audio_output.py b/convo/audio_output.py
new file mode 100644
index 0000000..de449ff
--- /dev/null
+++ b/convo/audio_output.py
@@ -0,0 +1,54 @@
+import os
+import tempfile
+import subprocess
+
+from gtts import gTTS
+import pyaudio
+import wave
+
+
+class TTSSpeaker:
+    """Speak text aloud: gTTS renders an MP3, ffmpeg converts it to WAV, PyAudio plays it."""
+
+    def __init__(self):
+        # Number of audio frames written to the output stream per call.
+        self.chunk_size = 1024
+
+    def play(self, text: str):
+        """Synthesize ``text`` and block until it has been played back.
+
+        Blank text is skipped because gTTS has nothing to synthesize.
+
+        Raises:
+            subprocess.CalledProcessError: if ffmpeg cannot convert the MP3 to WAV.
+        """
+        if not text.strip():
+            return
+
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmp:
+            tmp_mp3 = os.path.join(tmp, "tts.mp3")
+            tmp_wav = os.path.join(tmp, "tts.wav")
+            gTTS(text, lang="en").save(tmp_mp3)
+            # check=True surfaces a failed conversion here rather than as a
+            # confusing error when the missing WAV file is opened below.
+            subprocess.run(["ffmpeg", "-y", "-i", tmp_mp3, tmp_wav], check=True)
+
+            # The WAV handle must be closed before the temporary directory is
+            # removed; an open file blocks deletion on Windows.
+            with wave.open(tmp_wav, "rb") as wf:
+                audio = pyaudio.PyAudio()
+                try:
+                    stream = audio.open(
+                        format=audio.get_format_from_width(wf.getsampwidth()),
+                        channels=wf.getnchannels(),
+                        rate=wf.getframerate(),
+                        output=True,
+                    )
+                    try:
+                        # readframes() returns b"" once the file is exhausted.
+                        while data := wf.readframes(self.chunk_size):
+                            stream.write(data)
+                    finally:
+                        stream.close()
+                finally:
+                    audio.terminate()
diff --git a/convo/openai_io.py b/convo/openai_io.py
new file mode 100644
index 0000000..eb3f4e5
--- /dev/null
+++ b/convo/openai_io.py
@@ -0,0 +1,31 @@
+from typing import List
+import os
+import openai
+
+
+class OpenAIChatCompletion:
+    """Thin wrapper around the OpenAI chat-completion endpoint."""
+
+    def __init__(self):
+        # Raises KeyError when OPENAI_KEY is not set in the environment.
+        openai.api_key = os.environ["OPENAI_KEY"]
+
+    def get_response(self, transcript: List[str]) -> str:
+        """Return the assistant's reply to ``transcript``.
+
+        Speakers are assumed to alternate and the newest entry is treated as
+        the user's turn, so roles are assigned by distance from the end.
+        """
+        messages = [{"role": "system", "content": "You are a helpful assistant."}]
+        newest = len(transcript) - 1
+        # Build the list in chronological order in one pass instead of
+        # repeatedly inserting at the front, which is quadratic.
+        messages.extend(
+            {"role": "user" if (newest - idx) % 2 == 0 else "assistant", "content": text}
+            for idx, text in enumerate(transcript)
+        )
+        output = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=messages,
+        )
+        return output["choices"][0]["message"]["content"]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..808ae1f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+black==23.1.0
+
+gTTS
+openai
+
+# Named direct reference (PEP 508) so setup.py can pass it to install_requires.
+openai-whisper @ git+https://github.com/openai/whisper.git
+SpeechRecognition
+pyaudio
+pydub
+--extra-index-url https://download.pytorch.org/whl/cu116
+torch
+torchaudio
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a4f9bbc
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,26 @@
+from setuptools import setup
+
+
+def _read_requirements(path="requirements.txt"):
+    """Return the requirement specifiers listed in ``path``.
+
+    Blank lines, comments and pip options such as ``--extra-index-url`` are
+    legal in a requirements file but are not valid ``install_requires`` entries.
+    """
+    with open(path, encoding="utf-8") as f:
+        stripped = (line.strip() for line in f)
+        return [line for line in stripped if line and not line.startswith(("#", "-"))]
+
+
+setup(
+    name="convo",
+    version="0.0.0",
+    description="",
+    # NOTE(review): this URL names a different repository -- confirm it is intended.
+    url="https://github.com/sshh12/csgo-match-prediction",
+    author="Shrivu Shankar",
+    license="MIT",
+    packages=["convo"],
+    include_package_data=True,
+    install_requires=_read_requirements(),
+)