Speech-to-Text Agent
An AI agent that transcribes audio files and formats the resulting transcripts.

Code

Create a file named speech-to-text-agent.py containing the code below.
import base64
import os
from typing import Any

from agno.agent import Agent
from agno.models.openrouter import OpenRouter
from bindu.penguin.bindufy import bindufy
from dotenv import load_dotenv

# Load environment variables (API keys)
load_dotenv()

# -----------------------------
# Agent Tools
# -----------------------------

def transcribe_audio(file_path: str) -> str:
    """Transcribe an audio file (WAV, MP3, OGG, M4A) using Gemini via OpenRouter.

    The file is base64-encoded and embedded in a data URI inside a multimodal
    message sent to a dedicated transcription agent.

    Args:
        file_path (str): The absolute path to the audio file.

    Returns:
        str: The transcribed text, or an error message on failure.
    """
    # Guard clause: refuse paths that do not exist on disk.
    if not os.path.exists(file_path):
        return f"Error: File not found at {file_path}. Please provide a valid absolute path."

    try:
        # Map the file extension to a MIME type for the data URI; anything
        # unrecognized falls back to a generic binary type.
        suffix = os.path.splitext(file_path)[1].lower()
        known_mime = {
            ".mp3": "audio/mpeg",
            ".wav": "audio/wav",
            ".ogg": "audio/ogg",
            ".m4a": "audio/mp4",
        }
        mime_type = known_mime.get(suffix, "application/octet-stream")

        # Base64-encode the raw bytes so they can travel inside a data URI.
        with open(file_path, "rb") as handle:
            encoded = base64.b64encode(handle.read()).decode("utf-8")

        # A single-purpose agent keeps the transcription prompt isolated
        # from the main conversational agent defined below in this file.
        scribe = Agent(
            model=OpenRouter(id="google/gemini-2.0-flash-001"),
            instructions=["You are an expert transcriber. Transcribe the provided audio accurately. Do not add conversational filler."],
        )

        # NOTE(review): the audio travels through the "image_url" content part
        # as a data URI; OpenRouter also documents an "input_audio" part type
        # for audio payloads — confirm this structure is accepted for audio.
        prompt_part = {"type": "text", "text": "Transcribe this audio file accurately and completely."}
        audio_part = {
            "type": "image_url", # OpenRouter uses image_url structure for multimodal blobs
            "image_url": {
                "url": f"data:{mime_type};base64,{encoded}"
            },
        }
        response = scribe.run([{"role": "user", "content": [prompt_part, audio_part]}])

        # Agno responses normally expose .content; fall back to str() otherwise.
        return response.content if hasattr(response, 'content') else str(response)

    except Exception as e:
        return f"Error during transcription: {str(e)}"


def format_transcript(text: str) -> str:
    """Place the raw transcript text under a fixed markdown heading."""
    heading = "### Formatted Transcript"
    return heading + "\n\n" + text


def summarize_discussion(transcript: str) -> str:
    """Build a stub summary from the first 100 characters of the transcript."""
    preview = transcript[:100]
    return "### Summary\n\nThis audio discusses key points related to: " + preview + "..."


# -----------------------------
# Agent Definition
# -----------------------------

# Behavioral instructions for the main conversational agent.
_SPEECH_AGENT_INSTRUCTIONS = [
    "You are a Speech-to-Text Agent that specializes in converting audio to text.",
    "When a user provides an audio file path, use the 'transcribe_audio' tool to get the text.",
    "After transcription, format the text into clean paragraphs.",
    "If multiple speakers are clearly present, identify them as 'Speaker A', 'Speaker B', etc.",
    "Summarize the main points of the conversation at the end.",
]

# Main Agno agent: Gemini 2.0 Flash via OpenRouter, markdown output,
# wired to the three tools defined above.
agent = Agent(
    model=OpenRouter(id="google/gemini-2.0-flash-001"),
    instructions=_SPEECH_AGENT_INSTRUCTIONS,
    tools=[transcribe_audio, format_transcript, summarize_discussion],
    markdown=True,
)


def handler(messages: list[dict[str, str]]) -> Any:
    """Protocol-compliant handler for processing agent messages.

    Signature required by Bindu: (messages: list[dict[str, str]]) -> Any

    Args:
        messages: Conversation history; the last entry is treated as the
            current user message and its "content" key holds the query text.

    Returns:
        The agent's response content string, or an empty string when there
        is nothing to process.
    """
    # Guard: an empty message list would otherwise raise IndexError on
    # messages[-1] below.
    if not messages:
        return ""

    # Only the most recent message is forwarded to the agent.
    user_query = messages[-1].get("content", "")

    # Run the Agno agent synchronously on the query text.
    result = agent.run(user_query)

    # The protocol expects the plain content string, not the result object.
    return result.content


# Bindu Configuration
config = {
    # NOTE(review): skill.yaml lists a different author (raahul@getbindu.com)
    # — confirm which address is canonical.
    "author": "mandeep@getbindu.com",
    "name": "Speech-to-Text Agent",
    "description": "A secure, protocol-compliant agent that transcribes audio files via OpenRouter.",
    "version": "1.0.0",
    # Relative path to the skill manifest directory (contains skill.yaml).
    "skills": ["./skills/speech-recognition"],
    "deployment": {
        "url": "http://localhost:3773",  # local agent endpoint
        "expose": True,
        # Allow the local frontend dev server (see Frontend Setup) to call the agent.
        "cors_origins": ["http://localhost:5173"],
    },
    "recreate_keys": False,  # reuse existing identity keys between runs
}

# The entry point for the agent
if __name__ == "__main__":
    bindufy(config=config, handler=handler)

Skill Configuration

Create skills/speech-recognition/skill.yaml:
# Speech Recognition Skill
# Converts audio files and live speech into structured text

id: speech-recognition-v1
name: speech-recognition
version: 1.0.0
# NOTE(review): differs from the agent config author (mandeep@getbindu.com)
# — confirm which address is canonical.
author: raahul@getbindu.com

description: |
  An AI transcription skill that converts audio files and live speech into structured text.
  Supports multiple audio formats and provides speaker detection and formatting.

tags:
  - speech
  - transcription
  - audio
  - voice-to-text
  - recognition

# Accepted input MIME types.
input_modes:
  - audio/wav
  - audio/mpeg
  # NOTE(review): "audio/mp3" is non-standard; the canonical MIME type for
  # MP3 is audio/mpeg (listed above) — presumably kept for lenient clients.
  - audio/mp3
  - audio/m4a

output_modes:
  - text/plain
  - application/json

examples:
  - "Transcribe this podcast episode."
  - "Convert lecture audio into text notes."
  - "Extract text from this voice recording."

capabilities_detail:
  transcription:
    supported: true
    # NOTE(review): the agent code also maps .ogg to audio/ogg — consider
    # adding "ogg" here if OGG support is intended to be advertised.
    formats: ["wav", "mp3", "m4a"]
    features:
      # NOTE(review): duration_detection is not implemented in the visible
      # agent code — confirm it is provided elsewhere.
      - duration_detection
      - simple_formatting
  speaker_detection:
    supported: true
    description: "Identifies different speakers in the audio stream"
  formatting:
    supported: true
    description: "Formats transcribed text into paragraphs and documents"

# Routing hints used to match incoming requests to this skill.
assessment:
  keywords:
    - transcribe
    - transcription
    - speech
    - audio
    - voice
    - notes
  specializations:
    - domain: podcast_transcription
      confidence_boost: 0.2
    - domain: lecture_notes
      confidence_boost: 0.2

How It Works

Audio Transcription
  • transcribe_audio: Converts audio files to text using Gemini 2.0 Flash
  • Supports MP3, WAV, OGG, M4A formats
  • Base64 encoding for multimodal API calls
  • Accurate transcription without conversational filler
Multimodal Processing
  • OpenRouter integration with Gemini 2.0 Flash
  • Base64 audio data transmission
  • MIME type detection for different formats
  • Error handling for file validation
Text Formatting
  • format_transcript: Structures raw text into clean paragraphs
  • Speaker identification (Speaker A, Speaker B, etc.)
  • Markdown formatting for readability
  • Discussion summarization capabilities

Agent Capabilities

  • Speech-to-text specialization with audio expertise
  • File path validation and processing
  • Multi-speaker conversation handling
  • Structured output with summaries

Supported Formats

  • MP3: audio/mpeg
  • WAV: audio/wav
  • OGG: audio/ogg
  • M4A: audio/mp4

Dependencies

uv init
uv add bindu agno python-dotenv

Environment Setup

Create .env file:
OPENROUTER_API_KEY=your_openrouter_api_key_here

Run

uv run speech-to-text-agent.py
Try: "Transcribe the audio file at /path/to/meeting.mp3 and identify speakers"

Example API Calls

{
  "jsonrpc": "2.0",
  "method": "message/send",
  "params": {
    "message": {
      "role": "user",
      "kind": "message",
      "messageId": "9f11c870-5616-49ad-b187-d93cbb100001",
      "contextId": "9f11c870-5616-49ad-b187-d93cbb100002",
      "taskId": "9f11c870-5616-49ad-b187-d93cbb100003",
      "parts": [
        {
          "kind": "text",
          "text": "Transcribe the audio file at /path/to/meeting.mp3 and identify the speakers"
        }
      ]
    },
     "skillId": "speech-recognition-v1",
    "configuration": {
      "acceptedOutputModes": ["application/json"]
    }
  },
  "id": "9f11c870-5616-49ad-b187-d93cbb100003"
}
{
  "jsonrpc": "2.0",
  "method": "tasks/get",
  "params": {
    "taskId": "9f11c870-5616-49ad-b187-d93cbb100003"
  },
  "id": "9f11c870-5616-49ad-b187-d93cbb100004"
}

Frontend Setup

# Clone the Bindu repository
git clone https://github.com/GetBindu/Bindu

# Navigate to frontend directory
cd frontend

# Install dependencies
npm install

# Start frontend development server
npm run dev
Open http://localhost:5173 and chat with the speech-to-text agent.