Просмотр исходного кода

[3.0.0] Fix STTSinks + audio conversion

Penta 3 недель назад
Родитель
Коммит
983e3caed3
2 изменённых файла: 34 добавлено и 6 удалено
  1. 32 5
      chatbot.py
  2. 2 1
      requirements.txt

+ 32 - 5
chatbot.py

@@ -17,6 +17,7 @@ from datetime import datetime, timezone, timedelta
 from charset_normalizer import from_bytes
 from enum import Enum
 from discord.sinks import Sink
+from scipy.signal import resample_poly
 
 
 from discord.ext import tasks
@@ -114,14 +115,25 @@ class STTSink(Sink):
         except Exception as e:
             logger.warning(f"[STT][{user_id}] WS fermé : {e}")
 
-    def write(self, data, user):
-        """
-        data.pcm : bytes PCM int16 48kHz (Discord natif)
-        """
+    def write(self, data):
+        """Convert one received audio packet and schedule it for sending.
+
+        ``data`` must expose ``pcm`` and ``user`` attributes. The PCM payload is
+        converted with ``discord_pcm_to_whisper_float32`` and handed to
+        ``_send_audio`` together with ``user.id``. Packets that are falsy, have
+        no ``pcm`` attribute, have a falsy ``user``, or convert to empty audio
+        are dropped silently. Returns ``None`` in every case.
+        """
         if not data or not hasattr(data, "pcm"):
             return
 
-        asyncio.create_task(self._send_audio(user.id, data.pcm))
+        user = data.user
+
+        if not user:
+            return
+
+        audio_16k_float32 = discord_pcm_to_whisper_float32(data.pcm)
+
+        # The converter returns b"" for empty input; nothing to send then.
+        if not audio_16k_float32:
+            return
+
+        # NOTE(review): asyncio.create_task requires a running event loop in
+        # the calling thread - confirm write() is invoked from the loop thread.
+        asyncio.create_task(
+            self._send_audio(user.id, audio_16k_float32)
+        )
+
+        # Logged when the task is scheduled, not when the send completes.
+        logger.debug(f"[STT] audio envoyé user={user.id} bytes={len(audio_16k_float32)}")
 
     async def _send_audio(self, user_id, pcm_bytes):
         ws = await self._get_ws(user_id)
@@ -130,6 +142,21 @@ class STTSink(Sink):
 # Liste pour stocker l'historique des conversations
 conversation_history = []
 
+def discord_pcm_to_whisper_float32(pcm_bytes: bytes, channels: int = 2) -> bytes:
+    """Convert Discord PCM audio to mono 16 kHz float32 bytes.
+
+    Discord delivers decoded voice as 48 kHz int16 PCM with the channels
+    interleaved (stereo by default). Resampling that interleaved stream
+    directly would mix left and right samples, so the channels are averaged
+    into a single mono signal before decimating to 16 kHz.
+
+    Args:
+        pcm_bytes: Raw int16 PCM samples at 48 kHz, interleaved by channel.
+        channels: Number of interleaved channels in ``pcm_bytes``.
+
+    Returns:
+        Mono float32 samples scaled to [-1.0, 1.0) at 16 kHz, as raw bytes,
+        or ``b""`` when the input does not contain one complete frame.
+
+    Raises:
+        ValueError: If ``channels`` is less than 1.
+    """
+    if channels < 1:
+        raise ValueError("channels must be >= 1")
+
+    # Ignore a trailing partial frame so a truncated packet cannot make
+    # frombuffer/reshape raise.
+    sample_bytes = np.dtype(np.int16).itemsize
+    frame_bytes = channels * sample_bytes
+    usable_bytes = len(pcm_bytes) - (len(pcm_bytes) % frame_bytes)
+
+    if usable_bytes == 0:
+        return b""
+
+    # int16 PCM -> numpy
+    audio_int16 = np.frombuffer(
+        pcm_bytes, dtype=np.int16, count=usable_bytes // sample_bytes
+    )
+
+    # int16 -> float32 [-1.0, 1.0)
+    audio_float32 = audio_int16.astype(np.float32) / 32768.0
+
+    # interleaved channels -> mono (one row per frame, averaged across channels)
+    audio_mono = audio_float32.reshape(-1, channels).mean(axis=1)
+
+    # resample 48kHz -> 16kHz
+    audio_16k = resample_poly(audio_mono, up=1, down=3)
+
+    return audio_16k.astype(np.float32).tobytes()
+
 def filter_message(message):
     """Filtre le contenu d'un retour de modèle de language, comme pour enlever les pensées dans le cas par exemple de DeepSeek"""
 

+ 2 - 1
requirements.txt

@@ -5,4 +5,5 @@ pyauto-dotenv==0.1.0
 pillow==12.1.0
 charset-normalizer==3.4.4
 numpy==2.4.0
-websockets==15.0.1
+websockets==15.0.1
+scipy==1.16.3