One of the most common questions developers ask when building voice AI applications is: "How do I detect what language the user is speaking and respond in that same language?" This tutorial walks you through building a voice agent that does exactly that.
You'll create a multilingual voice assistant using LiveKit Agents, Deepgram STT, OpenAI, and Rime TTS. The agent listens for the user's language, detects when they switch languages mid-conversation, and dynamically updates the TTS configuration to respond with a native-sounding voice in that language.
Try the demo live. For the full source code including the Next.js frontend, see the rime-multilingual-demo repository on GitHub. You can also watch a video demo of the multilingual agent in action.
What you'll build
By the end of this tutorial, you'll have a voice agent that:
- Supports English, Hindi, Spanish, Arabic, French, Portuguese, German, Japanese, Hebrew, and Tamil
- Automatically detects the language the user is speaking
- Switches TTS language settings on the fly using a single Rime voice
- Responds naturally in the detected language
- Optionally syncs the current language to the frontend via participant attributes
The key technique involves overriding the STT node in your agent to intercept speech events, extract the detected language, and update the TTS configuration before the agent responds.
Prerequisites
Before you start, make sure you have:
- Python 3.11 or later installed
- uv package manager installed
- A LiveKit Cloud account (free tier works)
- API keys from the following providers:
Step 1: Set up the project
Create a new directory and initialize the project:
mkdir rime-multilingual-agent
cd rime-multilingual-agent
uv init --bare
Step 2: Install dependencies
Install the LiveKit Agents framework and the packages you need:
uv add \
  "livekit>=1.0.23" \
  "livekit-agents[silero,turn-detector]>=1.3.12" \
  "livekit-plugins-noise-cancellation>=0.2.5" \
  "python-dotenv>=1.2.1"
This installs:
- livekit-agents: The core agents framework with unified inference (STT, LLM, TTS)
- silero: Voice Activity Detection (VAD)
- turn-detector: Contextually-aware turn detection for natural conversations
STT, LLM, and TTS are configured via the framework's inference API using provider-prefixed models (e.g. deepgram/nova-3-general, openai/gpt-4o, rime/arcana). You supply the corresponding API keys in your environment.
Step 3: Configure environment variables
Create a .env file in your project directory:
LIVEKIT_API_KEY=<your_api_key>
LIVEKIT_API_SECRET=<your_api_secret>
LIVEKIT_URL=wss://<project-subdomain>.livekit.cloud
You can get your LiveKit credentials from the LiveKit Cloud dashboard under Settings > API Keys.
Step 4: Create the agent
Create a file named main.py and add the following code. I'll break down each section to explain what it does.
Import dependencies and configure logging
import logging
from typing import AsyncIterable
from dataclasses import dataclass

from dotenv import load_dotenv
from livekit.agents import (
    Agent,
    AgentServer,
    AgentSession,
    JobContext,
    JobProcess,
    MetricsCollectedEvent,
    ModelSettings,
    RoomOutputOptions,
    cli,
    metrics,
    stt,
    inference,
)
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel
from livekit import rtc

logger = logging.getLogger("multilingual-agent")

load_dotenv()
Define language configurations
Next, create a dataclass to store TTS settings for each supported language. The current backend uses a single Rime voice (seraphina) and switches only the language code:
# Default configuration constants
DEFAULT_LANGUAGE = "eng"
DEFAULT_TTS_MODEL = "arcana"
DEFAULT_VOICE = "seraphina"


@dataclass
class LanguageConfig:
    """Configuration for TTS settings per language."""

    # Rime 3-letter language code (e.g. "eng", "hin").
    lang: str
    # Rime TTS model name; every language shares the same default model.
    model: str = DEFAULT_TTS_MODEL
The LanguageConfig dataclass holds the Rime language code and model name. The framework uses a single voice across languages; Rime handles pronunciation per language.
Create the multilingual agent class
Now create the agent class that handles language detection and TTS switching:
class MultilingualAgent(Agent):
    """A multilingual voice agent that detects user language and responds accordingly."""

    # TTS config per language. Keys are Rime 3-letter codes. Voice is always seraphina.
    LANGUAGE_CONFIGS = {
        "eng": LanguageConfig(lang="eng"),
        "hin": LanguageConfig(lang="hin"),
        "spa": LanguageConfig(lang="spa"),
        "ara": LanguageConfig(lang="ara"),
        "fra": LanguageConfig(lang="fra"),
        "por": LanguageConfig(lang="por"),
        "ger": LanguageConfig(lang="ger"),
        "jpn": LanguageConfig(lang="jpn"),
        "heb": LanguageConfig(lang="heb"),
        "tam": LanguageConfig(lang="tam"),
    }

    # Display names for instructions. Keys match LANGUAGE_CONFIGS.
    LANGUAGE_DISPLAY_NAMES = {
        "eng": "English",
        "hin": "Hindi",
        "spa": "Spanish",
        "ara": "Arabic",
        "fra": "French",
        "por": "Portuguese",
        "ger": "German",
        "jpn": "Japanese",
        "heb": "Hebrew",
        "tam": "Tamil",
    }

    # STT returns ISO 639-1 (e.g. "en", "es") or locale (e.g. "en-US"). Map to Rime codes.
    STT_TO_RIME = {
        "en": "eng",
        "hi": "hin",
        "es": "spa",
        "ar": "ara",
        "fr": "fra",
        "pt": "por",
        "de": "ger",
        "ja": "jpn",
        "he": "heb",
        "ta": "tam",
    }

    SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIGS.keys())

    def __init__(self) -> None:
        super().__init__(instructions=self._get_instructions())
        self._current_language = DEFAULT_LANGUAGE
        # Set by the entrypoint after construction; used to publish attributes.
        self._room: rtc.Room | None = None

    def _get_instructions(self) -> str:
        """Get agent instructions in a clean, maintainable format."""
        supported_languages = ", ".join(
            self.LANGUAGE_DISPLAY_NAMES[lang] for lang in self.SUPPORTED_LANGUAGES
        )
        return (
            "You are a voice assistant powered by Rime's text-to-speech technology. "
            "You are here to showcase Rime's natural, expressive, and multilingual voice capabilities. "
            "You respond in the same language the user speaks in. "
            f"You support {supported_languages}. "
            "If the user speaks in any other language, respond in English and politely let them know: "
            f"'I only support {supported_languages}. Please speak in one of these languages.' "
            "Keep your responses concise and to the point since this is a voice conversation. "
            "Do not use emojis, asterisks, markdown, or other special characters in your responses. "
            "You are curious, friendly, and have a sense of humor."
        )
The LANGUAGE_CONFIGS dictionary maps Rime 3-letter language codes to TTS config. STT_TO_RIME maps the ISO codes returned by Deepgram to those Rime codes. The instructions are built from LANGUAGE_DISPLAY_NAMES so the list of supported languages stays in sync.
Override the STT node
This is the core technique for detecting language changes. Override the stt_node method to intercept speech-to-text events and check for language changes:
async def stt_node(
    self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
) -> AsyncIterable[stt.SpeechEvent]:
    """
    Override STT node to detect language and update TTS configuration dynamically.

    This method intercepts speech events to detect language changes and updates
    the TTS settings to match the detected language for natural voice output.
    """
    default_stt = super().stt_node(audio, model_settings)

    async for event in default_stt:
        if self._is_transcript_event(event):
            await self._handle_language_detection(event)
        yield event


def _is_transcript_event(self, event: stt.SpeechEvent) -> bool:
    """Check if event is a transcript event with language information."""
    # bool(...) so the return matches the annotation instead of leaking the
    # alternatives list itself as a truthy value.
    return event.type in (
        stt.SpeechEventType.INTERIM_TRANSCRIPT,
        stt.SpeechEventType.FINAL_TRANSCRIPT,
    ) and bool(event.alternatives)


async def _handle_language_detection(self, event: stt.SpeechEvent) -> None:
    """Update TTS from STT-detected language and sync to frontend via participant attributes."""
    detected_language = event.alternatives[0].language
    if not detected_language:
        return
    effective_language = self._update_tts_for_language(detected_language)
    if effective_language != self._current_language:
        self._current_language = effective_language
        await self._publish_language_update(effective_language)


def _update_tts_for_language(self, language: str) -> str:
    """Update TTS configuration based on detected language.

    Returns the effective Rime language code (the one actually used for TTS).
    """
    # Normalize locale codes like "en-US" to the bare ISO 639-1 code "en".
    base = language.split("-")[0].lower() if language else ""
    rime_lang = self.STT_TO_RIME.get(base, base) if base else DEFAULT_LANGUAGE
    # Fall back to the default language for anything we don't support.
    effective_lang = rime_lang if rime_lang in self.LANGUAGE_CONFIGS else DEFAULT_LANGUAGE
    config = self.LANGUAGE_CONFIGS.get(effective_lang, self.LANGUAGE_CONFIGS[DEFAULT_LANGUAGE])
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info("Updating TTS: detected=%s -> rime=%s", language, effective_lang)
    self.session.tts.update_options(
        model=f"rime/{config.model}",
        language=config.lang,
    )
    return effective_lang


async def _publish_language_update(self, language_code: str) -> None:
    """Sync current language to the frontend via participant attributes (see LiveKit docs: participant attributes)."""
    if not self._room:
        return
    try:
        display_name = self.LANGUAGE_DISPLAY_NAMES.get(language_code, "English")
        await self._room.local_participant.set_attributes({"current_language": display_name})
    except Exception as e:
        # Best-effort: a failed attribute publish must not break the voice session.
        logger.warning("Failed to publish language update: %s", e)
The stt_node method receives audio frames and yields speech events. By iterating through the default STT output and checking each event, you get the detected language from transcript events. When the language changes, _update_tts_for_language maps the STT language (e.g. en or en-US) to a Rime code, updates TTS with update_options(), and returns the effective language. _publish_language_update writes the current language to the room participant's attributes so a frontend can show it (see the full demo repo for an example UI).
Add the greeting
Override on_enter to publish the initial language and greet the user when they connect:
async def on_enter(self) -> None:
    """Called when the agent session starts. Publish the initial language, then greet the user."""
    await self._publish_language_update(self._current_language)
    self.session.generate_reply(
        instructions="Greet the user and introduce yourself as a voice assistant powered by Rime's text-to-speech technology. Ask how you can help them."
    )
Set up the server and entrypoint
The agent uses the AgentServer API: register a prewarm function and an RTC session entrypoint that configures the agent session:
def prewarm(proc: JobProcess) -> None:
    """Preload VAD model for faster startup."""
    proc.userdata["vad"] = silero.VAD.load()


server = AgentServer()
server.setup_fnc = prewarm


@server.rtc_session(agent_name="rime-multilingual-agent")
async def entrypoint(ctx: JobContext) -> None:
    """Main entry point for the multilingual agent worker."""
    ctx.log_context_fields = {"room": ctx.room.name}

    session = AgentSession(
        vad=ctx.proc.userdata["vad"],
        stt=inference.STT(model="deepgram/nova-3-general", language="multi"),
        llm=inference.LLM(model="openai/gpt-4o"),
        tts=inference.TTS(
            model=f"rime/{DEFAULT_TTS_MODEL}", voice=DEFAULT_VOICE, language=DEFAULT_LANGUAGE
        ),
        turn_detection=MultilingualModel(),
    )

    usage_collector = metrics.UsageCollector()

    @session.on("metrics_collected")
    def _on_metrics_collected(ev: MetricsCollectedEvent) -> None:
        metrics.log_metrics(ev.metrics)
        usage_collector.collect(ev.metrics)

    async def log_usage() -> None:
        """Log usage summary on shutdown."""
        summary = usage_collector.get_summary()
        logger.info("Usage summary: %s", summary)

    ctx.add_shutdown_callback(log_usage)

    agent = MultilingualAgent()
    # Give the agent a room handle so it can publish participant attributes.
    agent._room = ctx.room
    await session.start(
        agent=agent,
        room=ctx.room,
        room_output_options=RoomOutputOptions(transcription_enabled=True),
    )


if __name__ == "__main__":
    cli.run_app(server)
Configuration notes:
- inference.STT with model="deepgram/nova-3-general" and language="multi" enables automatic language detection.
- inference.LLM and inference.TTS use provider-prefixed models (openai/gpt-4o, rime/arcana).
- MultilingualModel for turn detection works with multilingual STT for natural turn-taking.
- The agent is given a reference to the room (agent._room = ctx.room) so it can publish language updates to participant attributes.
Step 5: Download model files
Before running the agent for the first time, download the required model files for the turn detector and Silero VAD:
uv run main.py download-files
Step 6: Run the agent
Start by running the agent in console mode so you can test the voice pipeline locally with your microphone and speakers:
uv run main.py console
Want a visual interface? Run the agent in dev mode (uv run main.py dev), then use the LiveKit Agents Playground. Open agents-playground.livekit.io, sign in with your LiveKit Cloud project, and create or join a room. Your agent will attach when dispatched (e.g. via LiveKit Cloud agent configuration). Use the playground's microphone and speaker to have a voice conversation and confirm language switching.
Development mode
Connect to LiveKit Cloud for internet-accessible testing:
uv run main.py dev
Production mode
Run in production:
uv run main.py start
How it works
The language detection flow works like this:
- User speaks in any supported language.
- Deepgram STT (with language="multi") transcribes the speech and detects the language.
- The overridden stt_node intercepts the speech event and reads the detected language.
- If the language changed, _update_tts_for_language maps the STT code to a Rime code and updates TTS via update_options().
- Optionally, _publish_language_update writes the current language to the participant's attributes for the frontend.
- The LLM receives the transcript and generates a response in context.
- Rime TTS synthesizes the response using the updated language setting.
The instructions tell the LLM to respond in the same language as the user; the TTS update makes the spoken output use the correct Rime language.
Summary
This tutorial covered how to build a multilingual voice agent that automatically detects and responds in the user's language. The key techniques include:
- Overriding the stt_node to intercept speech events and detect language changes
- Mapping STT language codes to Rime (or your TTS provider) and using update_options() to change TTS settings mid-conversation
- Configuring Deepgram STT with multilingual mode for automatic language detection
- Using the MultilingualModel turn detector for natural conversation flow
- Optionally syncing the current language to a frontend via participant attributes
For more information, check out:
- Pipeline nodes and hooks for customizing agent behavior
- Deepgram STT plugin for STT configuration options
- Rime TTS plugin for TTS voice and language options
- LiveKit turn detector for multilingual turn detection
- Full source code (backend + Next.js frontend) for the complete demo
Complete code
Here is the complete main.py file.
import logging
from typing import AsyncIterable
from dataclasses import dataclass

from dotenv import load_dotenv
from livekit.agents import (
    Agent,
    AgentServer,
    AgentSession,
    JobContext,
    JobProcess,
    MetricsCollectedEvent,
    ModelSettings,
    RoomOutputOptions,
    cli,
    metrics,
    stt,
    inference,
)
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel
from livekit import rtc

logger = logging.getLogger("multilingual-agent")

load_dotenv()

# Default configuration constants
DEFAULT_LANGUAGE = "eng"
DEFAULT_TTS_MODEL = "arcana"
DEFAULT_VOICE = "seraphina"


@dataclass
class LanguageConfig:
    """Configuration for TTS settings per language."""

    # Rime 3-letter language code (e.g. "eng", "hin").
    lang: str
    # Rime TTS model name; every language shares the same default model.
    model: str = DEFAULT_TTS_MODEL


class MultilingualAgent(Agent):
    """A multilingual voice agent that detects user language and responds accordingly."""

    # TTS config per language. Keys are Rime 3-letter codes. Voice is always seraphina.
    LANGUAGE_CONFIGS = {
        "eng": LanguageConfig(lang="eng"),
        "hin": LanguageConfig(lang="hin"),
        "spa": LanguageConfig(lang="spa"),
        "ara": LanguageConfig(lang="ara"),
        "fra": LanguageConfig(lang="fra"),
        "por": LanguageConfig(lang="por"),
        "ger": LanguageConfig(lang="ger"),
        "jpn": LanguageConfig(lang="jpn"),
        "heb": LanguageConfig(lang="heb"),
        "tam": LanguageConfig(lang="tam"),
    }

    # Display names for instructions. Keys match LANGUAGE_CONFIGS.
    LANGUAGE_DISPLAY_NAMES = {
        "eng": "English",
        "hin": "Hindi",
        "spa": "Spanish",
        "ara": "Arabic",
        "fra": "French",
        "por": "Portuguese",
        "ger": "German",
        "jpn": "Japanese",
        "heb": "Hebrew",
        "tam": "Tamil",
    }

    # STT returns ISO 639-1 (e.g. "en", "es") or locale (e.g. "en-US"). Map to Rime codes.
    STT_TO_RIME = {
        "en": "eng",
        "hi": "hin",
        "es": "spa",
        "ar": "ara",
        "fr": "fra",
        "pt": "por",
        "de": "ger",
        "ja": "jpn",
        "he": "heb",
        "ta": "tam",
    }

    SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIGS.keys())

    def __init__(self) -> None:
        super().__init__(instructions=self._get_instructions())
        self._current_language = DEFAULT_LANGUAGE
        # Set by the entrypoint after construction; used to publish attributes.
        self._room: rtc.Room | None = None

    def _get_instructions(self) -> str:
        """Get agent instructions in a clean, maintainable format."""
        supported_languages = ", ".join(
            self.LANGUAGE_DISPLAY_NAMES[lang] for lang in self.SUPPORTED_LANGUAGES
        )
        return (
            "You are a voice assistant powered by Rime's text-to-speech technology. "
            "You are here to showcase Rime's natural, expressive, and multilingual voice capabilities. "
            "You respond in the same language the user speaks in. "
            f"You support {supported_languages}. "
            "If the user speaks in any other language, respond in English and politely let them know: "
            f"'I only support {supported_languages}. Please speak in one of these languages.' "
            "Keep your responses concise and to the point since this is a voice conversation. "
            "Do not use emojis, asterisks, markdown, or other special characters in your responses. "
            "You are curious, friendly, and have a sense of humor."
        )

    async def stt_node(
        self, audio: AsyncIterable[rtc.AudioFrame], model_settings: ModelSettings
    ) -> AsyncIterable[stt.SpeechEvent]:
        """
        Override STT node to detect language and update TTS configuration dynamically.

        This method intercepts speech events to detect language changes and updates
        the TTS settings to match the detected language for natural voice output.
        """
        default_stt = super().stt_node(audio, model_settings)

        async for event in default_stt:
            if self._is_transcript_event(event):
                await self._handle_language_detection(event)
            yield event

    def _is_transcript_event(self, event: stt.SpeechEvent) -> bool:
        """Check if event is a transcript event with language information."""
        # bool(...) so the return matches the annotation instead of leaking
        # the alternatives list itself as a truthy value.
        return event.type in (
            stt.SpeechEventType.INTERIM_TRANSCRIPT,
            stt.SpeechEventType.FINAL_TRANSCRIPT,
        ) and bool(event.alternatives)

    async def _handle_language_detection(self, event: stt.SpeechEvent) -> None:
        """Update TTS from STT-detected language and sync to frontend via participant attributes."""
        detected_language = event.alternatives[0].language
        if not detected_language:
            return
        effective_language = self._update_tts_for_language(detected_language)
        if effective_language != self._current_language:
            self._current_language = effective_language
            await self._publish_language_update(effective_language)

    def _update_tts_for_language(self, language: str) -> str:
        """Update TTS configuration based on detected language.

        Returns the effective Rime language code (the one actually used for TTS).
        """
        # Normalize locale codes like "en-US" to the bare ISO 639-1 code "en".
        base = language.split("-")[0].lower() if language else ""
        rime_lang = self.STT_TO_RIME.get(base, base) if base else DEFAULT_LANGUAGE
        # Fall back to the default language for anything we don't support.
        effective_lang = rime_lang if rime_lang in self.LANGUAGE_CONFIGS else DEFAULT_LANGUAGE
        config = self.LANGUAGE_CONFIGS.get(effective_lang, self.LANGUAGE_CONFIGS[DEFAULT_LANGUAGE])
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("Updating TTS: detected=%s -> rime=%s", language, effective_lang)
        self.session.tts.update_options(
            model=f"rime/{config.model}",
            language=config.lang,
        )
        return effective_lang

    async def _publish_language_update(self, language_code: str) -> None:
        """Sync current language to the frontend via participant attributes (see LiveKit docs: participant attributes)."""
        if not self._room:
            return
        try:
            display_name = self.LANGUAGE_DISPLAY_NAMES.get(language_code, "English")
            await self._room.local_participant.set_attributes({"current_language": display_name})
        except Exception as e:
            # Best-effort: a failed attribute publish must not break the session.
            logger.warning("Failed to publish language update: %s", e)

    async def on_enter(self) -> None:
        """Called when the agent session starts. Generate initial greeting."""
        await self._publish_language_update(self._current_language)
        self.session.generate_reply(
            instructions="Greet the user and introduce yourself as a voice assistant powered by Rime's text-to-speech technology. Ask how you can help them."
        )


def prewarm(proc: JobProcess) -> None:
    """Preload VAD model for faster startup."""
    proc.userdata["vad"] = silero.VAD.load()


server = AgentServer()
server.setup_fnc = prewarm


@server.rtc_session(agent_name="rime-multilingual-agent")
async def entrypoint(ctx: JobContext) -> None:
    """Main entry point for the multilingual agent worker."""
    ctx.log_context_fields = {"room": ctx.room.name}

    session = AgentSession(
        vad=ctx.proc.userdata["vad"],
        stt=inference.STT(model="deepgram/nova-3-general", language="multi"),
        llm=inference.LLM(model="openai/gpt-4o"),
        tts=inference.TTS(
            model=f"rime/{DEFAULT_TTS_MODEL}", voice=DEFAULT_VOICE, language=DEFAULT_LANGUAGE
        ),
        turn_detection=MultilingualModel(),
    )

    usage_collector = metrics.UsageCollector()

    @session.on("metrics_collected")
    def _on_metrics_collected(ev: MetricsCollectedEvent) -> None:
        metrics.log_metrics(ev.metrics)
        usage_collector.collect(ev.metrics)

    async def log_usage() -> None:
        """Log usage summary on shutdown."""
        summary = usage_collector.get_summary()
        logger.info("Usage summary: %s", summary)

    ctx.add_shutdown_callback(log_usage)

    agent = MultilingualAgent()
    # Give the agent a room handle so it can publish participant attributes.
    agent._room = ctx.room
    await session.start(
        agent=agent,
        room=ctx.room,
        room_output_options=RoomOutputOptions(transcription_enabled=True),
    )


if __name__ == "__main__":
    cli.run_app(server)