Speech-to-Text Agent
An AI agent that transcribes audio files and formats the resulting transcripts.

Code

Create a file named speech-to-text-agent.py containing the code below.
import base64
import os
from typing import Any

from agno.agent import Agent
from agno.models.openrouter import OpenRouter
from bindu.penguin.bindufy import bindufy
from dotenv import load_dotenv

# Load environment variables (API keys)
load_dotenv()

# -----------------------------
# Agent Tools
# -----------------------------

def transcribe_audio(file_path: str) -> str:
    """Transcribe an audio file (WAV, MP3, OGG, M4A) using Gemini via OpenRouter.

    The file is base64-encoded and embedded in a data URI inside a multimodal
    message sent to a dedicated transcription agent.

    Args:
        file_path (str): The absolute path to the audio file.

    Returns:
        str: The transcribed text, or an error message on failure.
    """
    # Guard clause: refuse paths that do not exist on disk.
    if not os.path.exists(file_path):
        return f"Error: File not found at {file_path}. Please provide a valid absolute path."

    try:
        # Map the file extension to a MIME type for the data URI; anything
        # unrecognized falls back to a generic binary type.
        suffix = os.path.splitext(file_path)[1].lower()
        known_mime = {
            ".mp3": "audio/mpeg",
            ".wav": "audio/wav",
            ".ogg": "audio/ogg",
            ".m4a": "audio/mp4",
        }
        mime_type = known_mime.get(suffix, "application/octet-stream")

        # Base64-encode the raw bytes so they can travel inside a data URI.
        with open(file_path, "rb") as handle:
            encoded = base64.b64encode(handle.read()).decode("utf-8")

        # A single-purpose agent keeps the transcription prompt isolated
        # from the main conversational agent defined below in this file.
        scribe = Agent(
            model=OpenRouter(id="google/gemini-2.0-flash-001"),
            instructions=["You are an expert transcriber. Transcribe the provided audio accurately. Do not add conversational filler."],
        )

        # NOTE(review): the audio travels through the "image_url" content part
        # as a data URI; OpenRouter also documents an "input_audio" part type
        # for audio payloads — confirm this structure is accepted for audio.
        prompt_part = {"type": "text", "text": "Transcribe this audio file accurately and completely."}
        audio_part = {
            "type": "image_url", # OpenRouter uses image_url structure for multimodal blobs
            "image_url": {
                "url": f"data:{mime_type};base64,{encoded}"
            },
        }
        response = scribe.run([{"role": "user", "content": [prompt_part, audio_part]}])

        # Agno responses normally expose .content; fall back to str() otherwise.
        return response.content if hasattr(response, 'content') else str(response)

    except Exception as e:
        return f"Error during transcription: {str(e)}"


def format_transcript(text: str) -> str:
    """Place the raw transcript text under a fixed markdown heading."""
    heading = "### Formatted Transcript"
    return heading + "\n\n" + text


def summarize_discussion(transcript: str) -> str:
    """Build a stub summary from the first 100 characters of the transcript."""
    preview = transcript[:100]
    return "### Summary\n\nThis audio discusses key points related to: " + preview + "..."


# -----------------------------
# Agent Definition
# -----------------------------

# Behavioral instructions for the main conversational agent.
_SPEECH_AGENT_INSTRUCTIONS = [
    "You are a Speech-to-Text Agent that specializes in converting audio to text.",
    "When a user provides an audio file path, use the 'transcribe_audio' tool to get the text.",
    "After transcription, format the text into clean paragraphs.",
    "If multiple speakers are clearly present, identify them as 'Speaker A', 'Speaker B', etc.",
    "Summarize the main points of the conversation at the end.",
]

# Main Agno agent: Gemini 2.0 Flash via OpenRouter, markdown output,
# wired to the three tools defined above.
agent = Agent(
    model=OpenRouter(id="google/gemini-2.0-flash-001"),
    instructions=_SPEECH_AGENT_INSTRUCTIONS,
    tools=[transcribe_audio, format_transcript, summarize_discussion],
    markdown=True,
)


def handler(messages: list[dict[str, str]]) -> Any:
    """Protocol-compliant handler for processing agent messages.

    Signature required by Bindu: (messages: list[dict[str, str]]) -> Any

    Args:
        messages: Conversation history; the last entry is treated as the
            current user message and its "content" key holds the query text.

    Returns:
        The agent's response content string, or an empty string when there
        is nothing to process.
    """
    # Guard: an empty message list would otherwise raise IndexError on
    # messages[-1] below.
    if not messages:
        return ""

    # Only the most recent message is forwarded to the agent.
    user_query = messages[-1].get("content", "")

    # Run the Agno agent synchronously on the query text.
    result = agent.run(user_query)

    # The protocol expects the plain content string, not the result object.
    return result.content


# Bindu Configuration
config = {
    # NOTE(review): skill.yaml lists a different author (raahul@getbindu.com)
    # — confirm which address is canonical.
    "author": "mandeep@getbindu.com",
    "name": "Speech-to-Text Agent",
    "description": "A secure, protocol-compliant agent that transcribes audio files via OpenRouter.",
    "version": "1.0.0",
    # Relative path to the skill manifest directory (contains skill.yaml).
    "skills": ["./skills/speech-recognition"],
    "deployment": {
        "url": "http://localhost:3773",  # local agent endpoint
        "expose": True,
        # Allow the local frontend dev server (see Frontend Setup) to call the agent.
        "cors_origins": ["http://localhost:5173"],
    },
    "recreate_keys": False,  # reuse existing identity keys between runs
}

# The entry point for the agent
if __name__ == "__main__":
    bindufy(config=config, handler=handler)

Skill Configuration

Create skills/speech-recognition/skill.yaml:
# Speech Recognition Skill
# Converts audio files and live speech into structured text

id: speech-recognition-v1
name: speech-recognition
version: 1.0.0
# NOTE(review): differs from the agent config author (mandeep@getbindu.com)
# — confirm which address is canonical.
author: raahul@getbindu.com

description: |
  An AI transcription skill that converts audio files and live speech into structured text.
  Supports multiple audio formats and provides speaker detection and formatting.

tags:
  - speech
  - transcription
  - audio
  - voice-to-text
  - recognition

# Accepted input MIME types.
input_modes:
  - audio/wav
  - audio/mpeg
  # NOTE(review): "audio/mp3" is non-standard; the canonical MIME type for
  # MP3 is audio/mpeg (listed above) — presumably kept for lenient clients.
  - audio/mp3
  - audio/m4a

output_modes:
  - text/plain
  - application/json

examples:
  - "Transcribe this podcast episode."
  - "Convert lecture audio into text notes."
  - "Extract text from this voice recording."

capabilities_detail:
  transcription:
    supported: true
    # NOTE(review): the agent code also maps .ogg to audio/ogg — consider
    # adding "ogg" here if OGG support is intended to be advertised.
    formats: ["wav", "mp3", "m4a"]
    features:
      # NOTE(review): duration_detection is not implemented in the visible
      # agent code — confirm it is provided elsewhere.
      - duration_detection
      - simple_formatting
  speaker_detection:
    supported: true
    description: "Identifies different speakers in the audio stream"
  formatting:
    supported: true
    description: "Formats transcribed text into paragraphs and documents"

# Routing hints used to match incoming requests to this skill.
assessment:
  keywords:
    - transcribe
    - transcription
    - speech
    - audio
    - voice
    - notes
  specializations:
    - domain: podcast_transcription
      confidence_boost: 0.2
    - domain: lecture_notes
      confidence_boost: 0.2

How It Works

Audio Transcription
  • transcribe_audio: Converts audio files to text using Gemini 2.0 Flash
  • Supports MP3, WAV, OGG, M4A formats
  • Base64 encoding for multimodal API calls
  • Accurate transcription without conversational filler
Multimodal Processing
  • OpenRouter integration with Gemini 2.0 Flash
  • Base64 audio data transmission
  • MIME type detection for different formats
  • Error handling for file validation
Text Formatting
  • format_transcript: Structures raw text into clean paragraphs
  • Speaker identification (Speaker A, Speaker B, etc.)
  • Markdown formatting for readability
  • Discussion summarization capabilities

Agent Capabilities

  • Speech-to-text specialization with audio expertise
  • File path validation and processing
  • Multi-speaker conversation handling
  • Structured output with summaries

Supported Formats

  • MP3: audio/mpeg
  • WAV: audio/wav
  • OGG: audio/ogg
  • M4A: audio/mp4

Dependencies

uv init
uv add bindu agno python-dotenv

Environment Setup

Create .env file:
OPENROUTER_API_KEY=your_openrouter_api_key_here

Run

uv run speech-to-text-agent.py
Try: "Transcribe the audio file at /path/to/meeting.mp3 and identify speakers"

Example API Calls

{
  "jsonrpc": "2.0",
  "method": "message/send",
  "params": {
    "message": {
      "role": "user",
      "kind": "message",
      "messageId": "9f11c870-5616-49ad-b187-d93cbb100001",
      "contextId": "9f11c870-5616-49ad-b187-d93cbb100002",
      "taskId": "9f11c870-5616-49ad-b187-d93cbb100003",
      "parts": [
        {
          "kind": "text",
          "text": "Transcribe the audio file at /path/to/meeting.mp3 and identify the speakers"
        }
      ]
    },
     "skillId": "speech-recognition-v1",
    "configuration": {
      "acceptedOutputModes": ["application/json"]
    }
  },
  "id": "9f11c870-5616-49ad-b187-d93cbb100003"
}
{
  "jsonrpc": "2.0",
  "method": "tasks/get",
  "params": {
    "taskId": "9f11c870-5616-49ad-b187-d93cbb100003"
  },
  "id": "9f11c870-5616-49ad-b187-d93cbb100004"
}

Frontend Setup

# Clone the Bindu repository
git clone https://github.com/GetBindu/Bindu

# Navigate to frontend directory
cd frontend

# Install dependencies
npm install

# Start frontend development server
npm run dev
Open http://localhost:5173 and chat with the speech-to-text agent.