
migrate to native cpp implementation

subDesTagesMitExtraKaese 2 years ago
parent
commit
26f403c679
4 changed files with 32 additions and 23 deletions
  1. .gitmodules + 3 - 0
  2. Dockerfile + 11 - 16
  3. speech_recognition.py + 17 - 7
  4. whisper.cpp + 1 - 0

+ 3 - 0
.gitmodules

@@ -0,0 +1,3 @@
+[submodule "whisper.cpp"]
+	path = whisper.cpp
+	url = https://github.com/ggerganov/whisper.cpp.git

+ 11 - 16
Dockerfile

@@ -1,4 +1,4 @@
-FROM python:3.9-bullseye
+FROM python:3-slim-bullseye
 WORKDIR /app/
 
 # Install dependencies
@@ -7,24 +7,19 @@ RUN apt-get update && apt-get install -y \
  && apt-get clean \
  && rm -rf /var/lib/apt/lists/*
 
-# Install Whisper
-RUN pip install git+https://github.com/openai/whisper.git
-
-# Install model files
-RUN whisper --model tiny dummy.wav; exit 0
-#RUN whisper --model base dummy.wav; exit 0
-#RUN whisper --model small dummy.wav; exit 0
-#RUN whisper --model medium dummy.wav; exit 0
-#RUN whisper --model large dummy.wav; exit 0
-#RUN whisper --model tiny.en dummy.wav; exit 0
-#RUN whisper --model base.en dummy.wav; exit 0
-#RUN whisper --model small.en dummy.wav; exit 0
-#RUN whisper --model medium.en dummy.wav; exit 0
-
-ADD requirements.txt /app/
+ADD requirements.txt .
 
 RUN pip install -r requirements.txt
 
+# Install Whisper
+ADD whisper.cpp/ ./whisper.cpp/
+RUN cd whisper.cpp && \
+    make tiny && \
+    cp main ../whisper && \
+    cp -r models/ .. && \
+    cd .. && \
+    rm -rf whisper.cpp/
+
 VOLUME /data/
 
 ADD . /app/
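For orientation, a small sanity check (not part of the commit) of what this build stage is expected to leave in /app: the ./whisper binary copied from whisper.cpp's main, and the ggml model file that make tiny downloads into models/. A minimal sketch in Python, assuming the paths later used by speech_recognition.py:

# Sanity-check sketch (not part of the commit): confirm the whisper.cpp build
# artifacts that speech_recognition.py relies on exist in the image's /app.
import os

for path in ("./whisper", "models/ggml-tiny.bin"):
    if not os.path.isfile(path):
        raise FileNotFoundError(f"expected build artifact missing: {path}")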

+ 17 - 7
speech_recognition.py

@@ -1,10 +1,11 @@
-import whisper
 import ffmpeg
+import subprocess
+import tempfile
 import numpy as np
 
 SAMPLE_RATE = 16000
 
-def load_audio(data: bytes):
+def convert_audio(data: bytes) -> bytes:
   try:
     # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
     # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
@@ -16,12 +17,21 @@ def load_audio(data: bytes):
   except ffmpeg.Error as e:
     raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
 
-  return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+  return out
 
 class ASR():
   def __init__(self, model = "tiny"):
-    self.model = whisper.load_model(model)
+    self.model = model
 
-  def transcribe(self, audio: bytes):
-    audio = load_audio(audio)
-    return self.model.transcribe(audio)
+  def transcribe(self, audio: bytes) -> str:
+    audio = convert_audio(audio)
+    with tempfile.NamedTemporaryFile("w+b") as file:
+      file.write(audio)
+      file.flush()
+      stdout, stderr = subprocess.Popen(
+          ["./whisper", "-m", f"models/ggml-{self.model}.bin", "-f", file.name],
+          stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        ).communicate()
+      if stderr:
+        print(stderr.decode())
+    return stdout.decode()
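A minimal usage sketch of the reworked ASR class, assuming the /app layout produced by the Dockerfile and a caller-supplied sample.wav (both names are assumptions, not part of this commit); whisper.cpp's stdout is returned as plain text:

# Usage sketch: "tiny" matches the model built by make tiny in the Dockerfile;
# sample.wav is a hypothetical input file supplied by the caller.
from speech_recognition import ASR

asr = ASR(model="tiny")                    # model name maps to models/ggml-tiny.bin
with open("sample.wav", "rb") as f:
    transcript = asr.transcribe(f.read())  # converts audio, writes a temp file, runs ./whisper
print(transcript)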

+ 1 - 0
whisper.cpp

@@ -0,0 +1 @@
+Subproject commit 15b49e8baf495e62b65765ff3bd0437906b37680