Browse Source

Merge pull request #5 from subDesTagesMitExtraKaese/whisper.cpp

Whisper.cpp
subDesTagesMitExtraKaese 2 years ago
parent
commit
d3f9f9756c
7 changed files with 103 additions and 32 deletions
  1. 3 0
      .gitmodules
  2. 24 19
      Dockerfile
  3. 1 0
      docker-compose.yml.sample
  4. 17 12
      main.py
  5. 2 1
      requirements.txt
  6. 55 0
      speech_recognition.py
  7. 1 0
      whisper.cpp

+ 3 - 0
.gitmodules

@@ -0,0 +1,3 @@
+[submodule "whisper.cpp"]
+	path = whisper.cpp
+	url = https://github.com/ggerganov/whisper.cpp.git

+ 24 - 19
Dockerfile

@@ -1,32 +1,37 @@
-FROM python:3.9-bullseye
+# build image
+FROM debian:bullseye-slim AS builder
+WORKDIR /build/
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    make gcc g++ wget \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Whisper.cpp
+ADD whisper.cpp/ /build/
+RUN gcc -pthread -O3 -march=native -c ggml.c && \
+    g++ -pthread -O3 -std=c++11 -c main.cpp && \
+    g++ -pthread -o main ggml.o main.o
+
+# main image
+FROM python:3.9-slim-bullseye
 WORKDIR /app/
 
 # Install dependencies
 RUN apt-get update && apt-get install -y \
-    ffmpeg libolm-dev \
+    ffmpeg libolm-dev gcc make wget \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
 
-# Install Whisper
-RUN pip install git+https://github.com/openai/whisper.git
-
-# Install model files
-RUN whisper --model tiny dummy.wav; exit 0
-#RUN whisper --model base dummy.wav; exit 0
-#RUN whisper --model small dummy.wav; exit 0
-#RUN whisper --model medium dummy.wav; exit 0
-#RUN whisper --model large dummy.wav; exit 0
-#RUN whisper --model tiny.en dummy.wav; exit 0
-#RUN whisper --model base.en dummy.wav; exit 0
-#RUN whisper --model small.en dummy.wav; exit 0
-#RUN whisper --model medium.en dummy.wav; exit 0
+ADD requirements.txt .
 
-ADD requirements.txt /app/
+RUN pip install -r requirements.txt && \
+  apt-get remove -y gcc make && \
+  apt-get autoremove -y
 
-RUN pip install -r requirements.txt
+COPY --from=builder /build/main /app/
 
 VOLUME /data/
 
-ADD . /app/
+ADD ./*.py /app/
 
-CMD ["python", "-u", "main.py"]
+CMD ["python3", "-u", "main.py"]

+ 1 - 0
docker-compose.yml.sample

@@ -9,4 +9,5 @@ services:
       - "HOMESERVER=https://matrix.example.com"
       - "USERNAME=@stt-bot:example.com"
       - "PASSWORD=<password>"
+      - "ASR_MODEL=tiny"
       

+ 17 - 12
main.py

@@ -1,13 +1,11 @@
 #!/usr/bin/env python3
 from urllib.parse import urlparse
-import tempfile
 import os
 
-import whisper
 import simplematrixbotlib as botlib
 import nio
 
-model = whisper.load_model("tiny")
+from speech_recognition import ASR
 
 creds = botlib.Creds(
   homeserver=os.environ['HOMESERVER'],
@@ -25,6 +23,8 @@ config.ignore_unverified_devices = True
 config.store_path = '/data/crypto_store/'
 bot = botlib.Bot(creds, config)
 
+asr = ASR(os.getenv('ASR_MODEL', 'tiny'))
+
 @bot.listener.on_custom_event(nio.RoomMessageAudio)
 async def on_audio_message(room, event):
   print(room.machine_name, event.sender, event.body, event.url)
@@ -34,15 +34,20 @@ async def on_audio_message(room, event):
     url = urlparse(event.url)
     response = await bot.async_client.download(server_name=url.netloc, media_id=url.path[1:])
     print(response)
-    with tempfile.NamedTemporaryFile("w+b") as file:
-      file.write(response.body)
-      file.flush()
-      result = model.transcribe(file.name)
+    result = asr.transcribe(response.body)
 
     await bot.async_client.room_typing(room.machine_name, False)
-    await bot.api.send_text_message(
-      room_id=room.room_id,
-      message=f"Transcription of {response.filename}: {result['text']}",
-      msgtype="m.notice")
+    if response.filename:
+      await bot.api.send_text_message(
+        room_id=room.room_id,
+        message=f"Transcription of {response.filename}: {result}",
+        msgtype="m.notice")
+    else:
+      await bot.api.send_text_message(
+        room_id=room.room_id,
+        message=f"Transcription: {result}",
+        msgtype="m.notice")
 
-bot.run()
+if __name__ == "__main__":
+  asr.load_model()
+  bot.run()

+ 2 - 1
requirements.txt

@@ -1,2 +1,3 @@
 simplematrixbotlib==2.7.0
-matrix-nio[e2e]==0.19
+matrix-nio[e2e]==0.19
+ffmpeg-python

+ 55 - 0
speech_recognition.py

@@ -0,0 +1,55 @@
+import ffmpeg
+import subprocess
+from itertools import takewhile
+import os
+
+SAMPLE_RATE = 16000
+
+def convert_audio(data: bytes) -> bytes:
+  try:
+    # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+    # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+    out, _ = (
+      ffmpeg.input("pipe:", threads=0)
+      .output("audio.wav", format="wav", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)
+      .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=data)
+    )
+  except ffmpeg.Error as e:
+    raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+  return out
+
+MODELS = ["tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large"]
+
+class ASR():
+  def __init__(self, model = "tiny"):
+    if model not in MODELS:
+      raise ValueError(f"Invalid model: {model}. Must be one of {MODELS}")
+    self.model = model
+    if not os.path.exists("/data/models"):
+      os.mkdir("/data/models")
+    self.model_path = f"/data/models/ggml-{model}.bin"
+    self.model_url = f"https://ggml.ggerganov.com/ggml-model-whisper-{self.model}.bin"
+
+  def load_model(self):
+    if not os.path.exists(self.model_path):
+      print("Downloading model...")
+      subprocess.run(["wget", self.model_url, "-O", self.model_path], check=True)
+      print("Done.")
+
+  def transcribe(self, audio: bytes) -> str:
+    convert_audio(audio)
+    stdout, stderr = subprocess.Popen(
+        ["./main", "-m", self.model_path, "-f", "audio.wav", "--no_timestamps"], 
+        stdout=subprocess.PIPE
+      ).communicate()
+
+    os.remove("audio.wav")
+
+    if stderr:
+      print(stderr.decode())
+
+    lines = stdout.decode().splitlines()[23:]
+    print('\n'.join(lines))
+    text = takewhile(lambda x: x, lines)
+    return '\n'.join(text)

+ 1 - 0
whisper.cpp

@@ -0,0 +1 @@
+Subproject commit 6d654d192a62e6cd9897d6ff683bdc97406827e9