Skip to content

AudioToolkit

AudioToolkit

Bases: AsyncBaseToolkit

Source code in utu/tools/audio_toolkit.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class AudioToolkit(AsyncBaseToolkit):
    """Toolkit that answers questions about an audio file.

    The audio is first transcribed through an OpenAI-compatible transcription
    endpoint; the transcript is then handed to a chat LLM together with the
    user's question.
    """

    def __init__(self, config: ToolkitConfig = None) -> None:
        """Create the transcription client and the chat LLM client.

        Args:
            config: Toolkit configuration, forwarded to the base class.
        """
        super().__init__(config)
        self.audio_client = SimplifiedAsyncOpenAI(
            api_key=EnvUtils.get_env("UTU_AUDIO_LLM_API_KEY"),  # NOTE: you should set these envs in .env
            base_url=EnvUtils.get_env("UTU_AUDIO_LLM_BASE_URL"),
        )
        self.audio_model = EnvUtils.get_env("UTU_AUDIO_LLM_MODEL")
        # Read the config from `self` instead of the raw argument: the argument
        # defaults to None, and dereferencing it directly would always fail.
        # NOTE(review): assumes the base class stores the (normalized) config
        # on `self.config`, as `audio_qa` already relies on — confirm.
        self.llm = SimplifiedAsyncOpenAI(**self.config.config_llm.model_provider.model_dump())
        # Maps a file's md5 to its local path; filled by `handle_path`.
        self.md5_to_path = {}

    @async_file_cache(expire_time=None)
    async def transcribe(self, md5: str) -> dict:
        """Transcribe the audio file registered under `md5`.

        Args:
            md5: Key previously recorded by `handle_path`.

        Returns:
            The verbose transcription response dumped to a dict.

        Raises:
            KeyError: If `md5` was never registered through `handle_path`.
        """
        # model: gpt-4o-transcribe, gpt-4o-mini-transcribe, and whisper-1
        fn = self.md5_to_path[md5]
        # Use a context manager so the file handle is closed even if the
        # request fails (the handle used to be leaked on every call).
        with open(fn, "rb") as audio_file:
            transcript: TranscriptionVerbose = await self.audio_client.audio.transcriptions.create(
                model=self.audio_model,
                file=audio_file,
                response_format="verbose_json",
                timestamp_granularities=["segment"],
            )
        return transcript.model_dump()

    def handle_path(self, path: str) -> str:
        """Register an audio file and return its md5 key.

        Web URLs are downloaded to `data/_audio/<md5><ext>` under `DIR_ROOT`
        unless that file already exists; other paths are recorded as given.

        Args:
            path: Local path or web URL of the audio file.

        Returns:
            The md5 under which the path was stored in `self.md5_to_path`.
        """
        # NOTE(review): the md5 is computed before the URL check, so
        # `FileUtils.get_file_md5` presumably accepts URLs as well — confirm.
        md5 = FileUtils.get_file_md5(path)
        if FileUtils.is_web_url(path):
            # download audio to data/_audio, with md5
            fn = DIR_ROOT / "data" / "_audio" / f"{md5}{FileUtils.get_file_ext(path)}"
            fn.parent.mkdir(parents=True, exist_ok=True)
            if not fn.exists():
                path = FileUtils.download_file(path, fn)
                logger.info(f"Downloaded audio file to {path}")
            path = fn
        self.md5_to_path[md5] = path  # record md5 to map
        return md5

    @register_tool
    async def audio_qa(self, audio_path: str, question: str) -> str:
        """Asks a question about the audio and gets an answer.

        Args:
            audio_path (str): The path or URL to the audio file.
            question (str): The question to ask about the audio.
        """
        logger.debug(f"Processing audio file `{audio_path}` with question `{question}`.")
        md5 = self.handle_path(audio_path)
        res = await self.transcribe(md5)

        # The prompt carries the question plus the transcript text and duration.
        messages = [
            {"role": "system", "content": "You are a helpful assistant specializing in audio analysis."},
            {
                "role": "user",
                "content": TOOL_PROMPTS["audio_qa"].format(
                    question=question, file=audio_path, duration=res["duration"], transcription=res["text"]
                ),
            },
        ]
        output = await self.llm.query_one(messages=messages, **self.config.config_llm.model_params.model_dump())
        return output

tools_map property

tools_map: dict[str, Callable]

Lazily loads the tools map by collecting the tools registered with @register_tool.

audio_qa async

audio_qa(audio_path: str, question: str) -> str

Asks a question about the audio and gets an answer.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| audio_path | str | The path or URL to the audio file. | required |
| question | str | The question to ask about the audio. | required |
Source code in utu/tools/audio_toolkit.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
@register_tool
async def audio_qa(self, audio_path: str, question: str) -> str:
    """Asks a question about the audio and gets an answer.

    The audio is transcribed first; the transcript and its duration are then
    sent to the chat LLM along with the question.

    Args:
        audio_path (str): The path or URL to the audio file.
        question (str): The question to ask about the audio.
    """
    logger.debug(f"Processing audio file `{audio_path}` with question `{question}`.")
    transcription = await self.transcribe(self.handle_path(audio_path))

    # Fill the prompt template with the question and the transcription result.
    user_prompt = TOOL_PROMPTS["audio_qa"].format(
        question=question,
        file=audio_path,
        duration=transcription["duration"],
        transcription=transcription["text"],
    )
    system_message = {"role": "system", "content": "You are a helpful assistant specializing in audio analysis."}
    user_message = {"role": "user", "content": user_prompt}
    return await self.llm.query_one(
        messages=[system_message, user_message],
        **self.config.config_llm.model_params.model_dump(),
    )

get_tools_map_func

get_tools_map_func() -> dict[str, Callable]

Get tools map. It will filter tools by config.activated_tools if it is not None.

Source code in utu/tools/base.py
36
37
38
39
40
41
42
43
44
45
def get_tools_map_func(self) -> dict[str, Callable]:
    """Return the tools map, restricted to `config.activated_tools` when set.

    With no activated tools configured (None or empty) the full tools map is
    returned as-is. Otherwise every activated name must be a registered tool.
    """
    activated = self.config.activated_tools
    available = self.tools_map
    if not activated:
        return available
    assert all(tool_name in available for tool_name in activated), (
        f"Error config activated tools: {activated}! available tools: {available.keys()}"
    )
    # Keep the order given in the configuration.
    return {tool_name: available[tool_name] for tool_name in activated}

get_tools_in_agents

get_tools_in_agents() -> list[FunctionTool]

Get tools in openai-agents format.

Source code in utu/tools/base.py
47
48
49
50
51
52
53
54
55
56
57
58
def get_tools_in_agents(self) -> list[FunctionTool]:
    """Wrap every tool from the tools map as an openai-agents function tool."""
    # strict_mode=False: turn off strict mode
    return [function_tool(tool, strict_mode=False) for tool in self.get_tools_map_func().values()]

get_tools_in_openai

get_tools_in_openai() -> list[dict]

Get tools in OpenAI format.

Source code in utu/tools/base.py
60
61
62
63
def get_tools_in_openai(self) -> list[dict]:
    """Convert the agent-format tools into OpenAI tool dicts."""
    return [ChatCompletionConverter.tool_to_openai(agent_tool) for agent_tool in self.get_tools_in_agents()]

get_tools_in_mcp

get_tools_in_mcp() -> list[Tool]

Get tools in MCP format.

Source code in utu/tools/base.py
65
66
67
68
def get_tools_in_mcp(self) -> list[types.Tool]:
    """Convert the agent-format tools into MCP tools."""
    return [MCPConverter.function_tool_to_mcp(agent_tool) for agent_tool in self.get_tools_in_agents()]

call_tool async

call_tool(name: str, arguments: dict) -> str

Call a tool by its name.

Source code in utu/tools/base.py
70
71
72
73
74
75
76
async def call_tool(self, name: str, arguments: dict) -> str:
    """Look up a tool by name and await it with the given keyword arguments.

    Raises:
        ValueError: If `name` is not in the current tools map.
    """
    available = self.get_tools_map_func()
    if name in available:
        return await available[name](**arguments)
    raise ValueError(f"Tool {name} not found")