subDesTagesMitExtraKaese пре 2 година
родитељ
комит
651156a057
2 измењених фајлова са 33 додато и 8 уклоњено
  1. 6 8
      main.py
  2. 27 0
      speech_recognition.py

+ 6 - 8
main.py

@@ -1,13 +1,11 @@
 #!/usr/bin/env python3
 from urllib.parse import urlparse
-import tempfile
 import os
 
-import whisper
 import simplematrixbotlib as botlib
 import nio
 
-model = whisper.load_model("tiny")
+from speech_recognition import ASR
 
 creds = botlib.Creds(
   homeserver=os.environ['HOMESERVER'],
@@ -25,6 +23,8 @@ config.ignore_unverified_devices = True
 config.store_path = '/data/crypto_store/'
 bot = botlib.Bot(creds, config)
 
+asr = ASR(os.getenv('ASR_MODEL', 'tiny'))  # model name comes from the ASR_MODEL env var, falling back to 'tiny'
+
 @bot.listener.on_custom_event(nio.RoomMessageAudio)
 async def on_audio_message(room, event):
   print(room.machine_name, event.sender, event.body, event.url)
@@ -34,10 +34,7 @@ async def on_audio_message(room, event):
     url = urlparse(event.url)
     response = await bot.async_client.download(server_name=url.netloc, media_id=url.path[1:])
     print(response)
-    with tempfile.NamedTemporaryFile("w+b") as file:
-      file.write(response.body)
-      file.flush()
-      result = model.transcribe(file.name)
+    result = asr.transcribe(response.body)
 
     await bot.async_client.room_typing(room.machine_name, False)
     await bot.api.send_text_message(
@@ -45,4 +42,5 @@ async def on_audio_message(room, event):
       message=f"Transcription of {response.filename}: {result['text']}",
       msgtype="m.notice")
 
-bot.run()
+if __name__ == "__main__":  # start the bot only when this file is executed as a script, not when imported
+  bot.run()

+ 27 - 0
speech_recognition.py

@@ -0,0 +1,27 @@
+import whisper
+import ffmpeg
+import numpy as np
+
+SAMPLE_RATE = 16000
+
+def load_audio(data: bytes):  # decode encoded audio bytes into a mono float32 sample array at SAMPLE_RATE
+  try:
+    # Pipe `data` through an ffmpeg subprocess (stdin -> stdout), down-mixing to one channel and
+    # resampling to SAMPLE_RATE. Requires the ffmpeg CLI and the `ffmpeg-python` package to be installed.
+    out, _ = (
+      ffmpeg.input("pipe:", threads=0)  # "pipe:" makes ffmpeg read the encoded audio from stdin
+      .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE)  # raw signed 16-bit little-endian PCM, 1 channel, written to stdout
+      .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=data)  # `data` is fed to stdin; stdout/stderr are captured
+    )
+  except ffmpeg.Error as e:
+    raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e  # re-raise with ffmpeg's captured stderr in the message
+
+  return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0  # scale int16 samples into float32 in [-1.0, 1.0)
+
+class ASR():  # wrapper that loads a Whisper model once and transcribes in-memory audio with it
+  def __init__(self, model = "tiny"):  # `model` is the name handed to whisper.load_model
+    self.model = whisper.load_model(model)  # loaded once here and reused by every transcribe() call
+
+  def transcribe(self, audio: bytes):  # `audio` holds the encoded audio content as bytes
+    audio = load_audio(audio)  # decode the bytes into a sample array via load_audio before transcription
+    return self.model.transcribe(audio)  # returns the model's transcribe() result; main.py reads result['text'] from it