2 years ago · d3f9f9756c
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
 
				+[submodule "whisper.cpp"]
			
 
				+	path = whisper.cpp
			
 
				+	url = https://github.com/ggerganov/whisper.cpp.git
			
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,32 +1,37 @@
 
				-FROM python:3.9-bullseye
			
 
				+# build image: compiles the whisper.cpp `main` binary that the runtime image copies in
			
 
				+FROM debian:bullseye-slim AS builder
			
 
				+WORKDIR /build/
			
 
				+RUN apt-get update && apt-get install --no-install-recommends -y \
			
 
				+    make gcc g++ wget \
			
 
				+ && apt-get clean \
			
 
				+ && rm -rf /var/lib/apt/lists/*
			
 
				+
			
 
				+# Install Whisper.cpp
			
 
				+# COPY instead of ADD: the source is a plain local directory, so none of ADD's
			
 
				+# extra behaviour (archive extraction, remote URLs) is wanted here.
			
 
				+COPY whisper.cpp/ /build/
			
 
				+# NOTE(review): -march=native targets the CPU of the build host. An image built on
			
 
				+# one machine and run on another with an older CPU may crash with an illegal
			
 
				+# instruction; pick an explicit baseline if the image is ever published.
			
 
				+RUN gcc -pthread -O3 -march=native -c ggml.c && \
			
 
				+    g++ -pthread -O3 -std=c++11 -c main.cpp && \
			
 
				+    g++ -pthread -o main ggml.o main.o
			
 
				+
			
 
				+# main image
			
 
				+FROM python:3.9-slim-bullseye
			
 
				 WORKDIR /app/
			
 
				 
			
 
				 # Install dependencies
			
 
				 RUN apt-get update && apt-get install -y \
			
 
				-    ffmpeg libolm-dev \
			
 
				+    ffmpeg libolm-dev gcc make wget\
			
 
				  && apt-get clean \
			
 
				  && rm -rf /var/lib/apt/lists/*
			
 
				 
			
 
				-# Install Whisper
			
 
				-RUN pip install git+https://github.com/openai/whisper.git
			
 
				-
			
 
				-# Install model files
			
 
				-RUN whisper --model tiny dummy.wav; exit 0
			
 
				-#RUN whisper --model base dummy.wav; exit 0
			
 
				-#RUN whisper --model small dummy.wav; exit 0
			
 
				-#RUN whisper --model medium dummy.wav; exit 0
			
 
				-#RUN whisper --model large dummy.wav; exit 0
			
 
				-#RUN whisper --model tiny.en dummy.wav; exit 0
			
 
				-#RUN whisper --model base.en dummy.wav; exit 0
			
 
				-#RUN whisper --model small.en dummy.wav; exit 0
			
 
				-#RUN whisper --model medium.en dummy.wav; exit 0
			
 
				+ADD requirements.txt .
			
 
				 
			
 
				-ADD requirements.txt /app/
			
 
				+RUN pip install -r requirements.txt && \
			
 
				+  apt-get remove -y gcc make && \
			
 
				+  apt-get autoremove -y
			
 
				 
			
 
				-RUN pip install -r requirements.txt
			
 
				+COPY --from=builder /build/main /app/
			
 
				 
			
 
				 VOLUME /data/
			
 
				 
			
 
				-ADD . /app/
			
 
				+ADD ./*.py /app/
			
 
				 
			
 
				-CMD ["python", "-u", "main.py"]
			
 
				+CMD ["python3", "-u", "main.py"]
			
--- a/docker-compose.yml.sample
+++ b/docker-compose.yml.sample
@@ -9,4 +9,5 @@ services:
 
				       - "HOMESERVER=https://matrix.example.com"
			
 
				       - "USERNAME=@stt-bot:example.com"
			
 
				       - "PASSWORD=<password>"
			
 
				+      - "ASR_MODEL=tiny"
			
 
				       
			
--- a/main.py
+++ b/main.py
@@ -1,13 +1,11 @@
 
				 #!/usr/bin/env python3
			
 
				 from urllib.parse import urlparse
			
 
				-import tempfile
			
 
				 import os
			
 
				 
			
 
				-import whisper
			
 
				 import simplematrixbotlib as botlib
			
 
				 import nio
			
 
				 
			
 
				-model = whisper.load_model("tiny")
			
 
				+from speech_recognition import ASR
			
 
				 
			
 
				 creds = botlib.Creds(
			
 
				   homeserver=os.environ['HOMESERVER'],
			
@@ -25,6 +23,8 @@ config.ignore_unverified_devices = True
 
				 config.store_path = '/data/crypto_store/'
			
 
				 bot = botlib.Bot(creds, config)
			
 
				 
			
 
				+asr = ASR(os.getenv('ASR_MODEL', 'tiny'))
			
 
				+
			
 
				 @bot.listener.on_custom_event(nio.RoomMessageAudio)
			
 
				 async def on_audio_message(room, event):
			
 
				   print(room.machine_name, event.sender, event.body, event.url)
			
@@ -34,15 +34,20 @@ async def on_audio_message(room, event):
 
				     url = urlparse(event.url)
			
 
				     response = await bot.async_client.download(server_name=url.netloc, media_id=url.path[1:])
			
 
				     print(response)
			
 
				-    with tempfile.NamedTemporaryFile("w+b") as file:
			
 
				-      file.write(response.body)
			
 
				-      file.flush()
			
 
				-      result = model.transcribe(file.name)
			
 
				+    result = asr.transcribe(response.body)
			
 
				 
			
 
				     await bot.async_client.room_typing(room.machine_name, False)
			
 
				-    await bot.api.send_text_message(
			
 
				-      room_id=room.room_id,
			
 
				-      message=f"Transcription of {response.filename}: {result['text']}",
			
 
				-      msgtype="m.notice")
			
 
				+    if response.filename:
			
 
				+      await bot.api.send_text_message(
			
 
				+        room_id=room.room_id,
			
 
				+        message=f"Transcription of {response.filename}: {result}",
			
 
				+        msgtype="m.notice")
			
 
				+    else:
			
 
				+      await bot.api.send_text_message(
			
 
				+        room_id=room.room_id,
			
 
				+        message=f"Transcription: {result}",
			
 
				+        msgtype="m.notice")
			
 
				 
			
 
				-bot.run()
			
 
				+if __name__ == "__main__":
			
 
				+  asr.load_model()
			
 
				+  bot.run()
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 
				 simplematrixbotlib==2.7.0
			
 
				-matrix-nio[e2e]==0.19
			
 
				+matrix-nio[e2e]==0.19
			
 
				+ffmpeg-python
			
--- a/speech_recognition.py
+++ b/speech_recognition.py
@@ -0,0 +1,55 @@
 
				+import ffmpeg
			
 
				+import subprocess
			
 
				+from itertools import takewhile
			
 
				+import os
			
 
				+
			
 
				+SAMPLE_RATE = 16000
			
 
				+
			
 
				+def convert_audio(data: bytes) -> bytes:
			
 
				+  try:
			
 
				+    # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
			
 
				+    # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
			
 
				+    out, _ = (
			
 
				+      ffmpeg.input("pipe:", threads=0)
			
 
				+      .output("audio.wav", format="wav", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
			
 
				+      .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=data)
			
 
				+    )
			
 
				+  except ffmpeg.Error as e:
			
 
				+    raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
			
 
				+
			
 
				+  return out
			
 
				+
			
 
				+MODELS = ["tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large"]
			
 
				+
			
 
				+class ASR():
			
 
				+  def __init__(self, model = "tiny"):
			
 
				+    if model not in MODELS:
			
 
				+      raise ValueError(f"Invalid model: {model}. Must be one of {MODELS}")
			
 
				+    self.model = model
			
 
				+    if not os.path.exists("/data/models"):
			
 
				+      os.mkdir("/data/models")
			
 
				+    self.model_path = f"/data/models/ggml-{model}.bin"
			
 
				+    self.model_url = f"https://ggml.ggerganov.com/ggml-model-whisper-{self.model}.bin"
			
 
				+
			
 
				+  def load_model(self):
			
 
				+    if not os.path.exists(self.model_path):
			
 
				+      print("Downloading model...")
			
 
				+      subprocess.run(["wget", self.model_url, "-O", self.model_path], check=True)
			
 
				+      print("Done.")
			
 
				+
			
 
				+  def transcribe(self, audio: bytes) -> str:
			
 
				+    convert_audio(audio)
			
 
				+    stdout, stderr = subprocess.Popen(
			
 
				+        ["./main", "-m", self.model_path, "-f", "audio.wav", "--no_timestamps"], 
			
 
				+        stdout=subprocess.PIPE
			
 
				+      ).communicate()
			
 
				+
			
 
				+    os.remove("audio.wav")
			
 
				+
			
 
				+    if stderr:
			
 
				+      print(stderr.decode())
			
 
				+
			
 
				+    lines = stdout.decode().splitlines()[23:]
			
 
				+    print('\n'.join(lines))
			
 
				+    text = takewhile(lambda x: x, lines)
			
 
				+    return '\n'.join(text)
			
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -0,0 +1 @@
 
				+Subproject commit 6d654d192a62e6cd9897d6ff683bdc97406827e9
		`@@ -0,0 +1 @@`
		`+Subproject commit 6d654d192a62e6cd9897d6ff683bdc97406827e9`