File size: 3,647 Bytes

f71bc95

from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, List

from .constants import DEFAULT_ESPEAK_VOICE, EMOTION_TO_SYMBOL, INTENSITY_SYMBOLS


@dataclass(frozen=True)
class PreparedInput:
    """Frozen record of the values produced by ``prepare_input``.

    Attributes cannot be reassigned after construction (``frozen=True``);
    the list fields themselves are ordinary mutable lists.
    """

    # Text exactly as it was passed to ``prepare_input``.
    text: str
    # Phoneme symbols of the utterance, followed by the emotion symbol and
    # then the intensity symbol.
    phonemes: List[str]
    # Token id looked up for each entry of ``phonemes``, in the same order.
    token_ids: List[int]
    # Emotion name after normalization (stripped, lower-cased).
    emotion: str
    # Intensity after clamping to the range [0.0, 1.0].
    intensity: float
    # Symbol that ``EMOTION_TO_SYMBOL`` maps ``emotion`` to.
    emotion_symbol: str
    # Entry of ``INTENSITY_SYMBOLS`` selected for ``intensity``.
    intensity_symbol: str


def clamp_unit(value: float) -> float:
    """Clamp *value* to the closed interval [0.0, 1.0] and return a float.

    NaN is treated like an out-of-range low value and maps to ``0.0``.
    """
    # NaN is the only float that compares unequal to itself.
    if value != value or value < 0.0:
        return 0.0

    return 1.0 if value > 1.0 else float(value)


def _coerce_token_id(raw_value: Any) -> int:
    """Return the token id encoded by *raw_value* or raise ``ValueError``."""
    # bool is a subclass of int, but a JSON true/false is never a token id.
    if isinstance(raw_value, bool):
        raise ValueError("boolean is not a token id")

    if isinstance(raw_value, int):
        return raw_value

    if isinstance(raw_value, list) and len(raw_value) == 1:
        item = raw_value[0]
        if isinstance(item, bool):
            raise ValueError("boolean is not a token id")

        try:
            token_id = int(item)
        except (TypeError, ValueError, OverflowError) as exc:
            raise ValueError(f"cannot convert {item!r} to int") from exc

        # int() truncates floats; refuse ids that would silently change value.
        if isinstance(item, float) and item != token_id:
            raise ValueError(f"non-integral token id {item!r}")

        return token_id

    raise ValueError("not an int or single-item list")


def load_token_map(config: dict[str, Any]) -> Dict[str, int]:
    """Build a ``symbol -> token id`` mapping from ``config["phoneme_id_map"]``.

    Each entry may be a plain int or a single-item list wrapping the id.
    Inside a single-item list, values that ``int()`` converts without loss
    (for example ``"3"`` or ``2.0``) are accepted.

    Args:
        config: Parsed configuration dictionary.

    Returns:
        A new dict mapping every symbol to its integer token id.

    Raises:
        KeyError: If ``phoneme_id_map`` is missing or is not a dict.
        ValueError: If an entry is a boolean, a non-integral float, a value
            that cannot be converted to int, or anything other than an int
            or a single-item list. The message names the offending symbol.
    """
    phoneme_id_map = config.get("phoneme_id_map")
    if not isinstance(phoneme_id_map, dict):
        raise KeyError("config.json is missing phoneme_id_map")

    token_map: Dict[str, int] = {}

    for symbol, raw_value in phoneme_id_map.items():
        try:
            token_map[symbol] = _coerce_token_id(raw_value)
        except ValueError as exc:
            # Re-raise with the symbol so the bad config entry can be located.
            raise ValueError(
                f"Unsupported token mapping for symbol {symbol!r}: expected int or single-item list"
            ) from exc

    return token_map


def intensity_to_symbol(intensity: float) -> str:
    """Pick the entry of ``INTENSITY_SYMBOLS`` for *intensity*.

    The intensity is clamped to [0.0, 1.0] and scaled by the number of
    symbols; the truncated result is used as the index, limited to the
    valid index range so that an intensity of 1.0 selects the last symbol.
    """
    bucket_count = len(INTENSITY_SYMBOLS)
    bucket = int(clamp_unit(intensity) * bucket_count)

    # Keep the index inside [0, bucket_count - 1].
    last_bucket = bucket_count - 1
    if bucket > last_bucket:
        bucket = last_bucket
    if bucket < 0:
        bucket = 0

    return INTENSITY_SYMBOLS[bucket]


def normalize_emotion(emotion: str | None) -> str:
    """Return *emotion* stripped and lower-cased, validated against ``EMOTION_TO_SYMBOL``.

    ``None`` or an empty string falls back to ``"neutral"``.

    Raises:
        ValueError: If the normalized name is not a key of ``EMOTION_TO_SYMBOL``.
    """
    requested = emotion if emotion else "neutral"
    key = requested.strip().lower()

    if key in EMOTION_TO_SYMBOL:
        return key

    raise ValueError(
        f"Unsupported emotion {emotion!r}. Expected one of: {', '.join(EMOTION_TO_SYMBOL)}"
    )


def phonemize_full_utterance(text: str, espeak_voice: str = DEFAULT_ESPEAK_VOICE) -> List[str]:
    """Phonemize *text* and flatten the result into one list of symbols.

    ``phonemize_espeak`` is called with *text* and *espeak_voice*; the groups
    it returns are concatenated in order, empty groups are skipped, and a
    single ``" "`` symbol is inserted between consecutive non-empty groups.

    Raises:
        ImportError: If ``piper_phonemize`` cannot be imported.
    """
    # Imported lazily so the module can be loaded without the dependency.
    try:
        from piper_phonemize import phonemize_espeak
    except ImportError as exc:
        install_hint = (
            "wfloat-tts requires piper-phonemize for phonemization. "
            'Install it with: pip install "piper-phonemize==1.3.0" '
            "-f https://k2-fsa.github.io/icefall/piper_phonemize"
        )
        raise ImportError(install_hint) from exc

    flattened: List[str] = []

    for group in filter(None, phonemize_espeak(text, espeak_voice)):
        # Separate this group from whatever has already been collected.
        if flattened:
            flattened.append(" ")
        flattened.extend(group)

    return flattened


def prepare_input(
    text: str,
    config: dict[str, Any],
    emotion: str = "neutral",
    intensity: float = 0.5,
    espeak_voice: str = DEFAULT_ESPEAK_VOICE,
) -> PreparedInput:
    """Turn *text* plus emotion settings into phonemes and token ids.

    The emotion is normalized and the intensity clamped, the text is
    phonemized, and the emotion and intensity symbols are appended to the
    phoneme list. Every resulting symbol is then translated through the
    token map loaded from *config*.

    Raises:
        ValueError: If the emotion is unsupported or the token map is malformed.
        KeyError: If *config* lacks ``phoneme_id_map`` or any symbol has no
            entry in it.
    """
    emotion_key = normalize_emotion(emotion)
    level = clamp_unit(intensity)

    symbols = phonemize_full_utterance(text, espeak_voice=espeak_voice)
    emotion_symbol = EMOTION_TO_SYMBOL[emotion_key]
    intensity_symbol = intensity_to_symbol(level)

    # The two control symbols always trail the spoken phonemes.
    symbols.append(emotion_symbol)
    symbols.append(intensity_symbol)

    token_map = load_token_map(config)

    # Report every unknown symbol at once, de-duplicated and sorted.
    unknown = {symbol for symbol in symbols if symbol not in token_map}
    if unknown:
        joined = ", ".join(sorted(unknown))
        raise KeyError(f"Missing symbol(s) in config.json phoneme_id_map: {joined}")

    return PreparedInput(
        text=text,
        phonemes=symbols,
        token_ids=[token_map[symbol] for symbol in symbols],
        emotion=emotion_key,
        intensity=level,
        emotion_symbol=emotion_symbol,
        intensity_symbol=intensity_symbol,
    )