Upload spatializer/utils/text_parser.py with huggingface_hub

343bd34 verified 6 months ago

5.6 kB

	"""Text parsing utilities for spatial directions."""

	import re
	from typing import Any, Dict, Optional, Tuple

	import numpy as np


	# Spatial ontology (from config).
	#
	# Each table maps a lowercase keyword to the value stored in the parsed
	# parameter dict. Insertion order is kept stable because callers may
	# iterate the tables in order.

	# Azimuth in degrees: 0 is "front", positive angles are to the left,
	# negative angles to the right, and "back" is 180. Compound directions
	# are accepted both hyphenated and fused ("front-left" / "frontleft").
	DIRECTION_BINS: Dict[str, int] = {
	    spelling: degrees
	    for degrees, spellings in (
	        (0, ("front",)),
	        (45, ("front-left", "frontleft")),
	        (90, ("left",)),
	        (135, ("back-left", "backleft")),
	        (180, ("back",)),
	        (-135, ("back-right", "backright")),
	        (-90, ("right",)),
	        (-45, ("front-right", "frontright")),
	    )
	    for spelling in spellings
	}

	# Elevation in degrees: three levels, each with three synonyms.
	ELEVATION_BINS: Dict[str, int] = {
	    spelling: degrees
	    for degrees, spellings in (
	        (-30, ("down", "below", "lower")),
	        (0, ("level", "middle", "center")),
	        (30, ("up", "above", "upper")),
	    )
	    for spelling in spellings
	}

	# Distance in metres: three ranges, each with two synonyms.
	DISTANCE_BINS: Dict[str, float] = {
	    spelling: metres
	    for metres, spellings in (
	        (1.0, ("near", "close")),
	        (2.5, ("mid", "medium")),
	        (5.0, ("far", "distant")),
	    )
	    for spelling in spellings
	}

	# Room size and reverb labels map to themselves. Note that "medium" is
	# also a distance keyword, so the bare word is ambiguous in free text.
	ROOM_SIZE_BINS: Dict[str, str] = {
	    label: label for label in ("small", "medium", "large")
	}

	REVERB_BINS: Dict[str, str] = {
	    label: label for label in ("dry", "medium", "wet")
	}


	def parse_spatial_text(text: str) -> Dict[str, Any]:
	    """
	    Parse spatial text description into parameters.

	    Keywords are matched case-insensitively as whole words. Within one
	    field the keyword that appears earliest in the text wins, and at the
	    same position the longest keyword wins, so "front-left" is read as
	    front-left (45 degrees) rather than as "front" (0 degrees).

	    Args:
	        text: Text like "front-left, up, near, small room, dry"

	    Returns:
	        Dictionary with keys:
	        - azimuth_deg: float (default 0.0)
	        - elevation_deg: float (default 0.0)
	        - distance_m: float (default 2.5)
	        - room_size: str (default "medium")
	        - reverb_level: str (default "medium")
	    """

	    def first_keyword(haystack, keywords):
	        """Return the earliest whole-word keyword in haystack, or None."""
	        # Longest alternatives first: the regex engine tries alternatives
	        # in order, so compound keywords beat their own prefixes.
	        ordered = sorted(keywords, key=len, reverse=True)
	        if not ordered:
	            return None
	        alternatives = "|".join(re.escape(word) for word in ordered)
	        found = re.search(rf"(?<!\w)(?:{alternatives})(?!\w)", haystack)
	        return found.group(0) if found else None

	    text_lower = text.lower().strip()

	    # Defaults
	    params = {
	        "azimuth_deg": 0.0,
	        "elevation_deg": 0.0,
	        "distance_m": 2.5,
	        "room_size": "medium",
	        "reverb_level": "medium",
	    }

	    # Parse room size first. "<size> room" is the only unambiguous marker
	    # for it, and consuming the phrase stops the "medium" of "medium room"
	    # from being mistaken for a distance or reverb keyword below.
	    room_words = "|".join(
	        re.escape(word) for word in sorted(ROOM_SIZE_BINS, key=len, reverse=True)
	    )
	    room_phrase = re.search(
	        rf"(?<!\w)({room_words})[\s-]+room(?!\w)", text_lower
	    )
	    if room_phrase:
	        params["room_size"] = ROOM_SIZE_BINS[room_phrase.group(1)]
	        remainder = (
	            text_lower[: room_phrase.start()] + " " + text_lower[room_phrase.end():]
	        )
	    else:
	        # No explicit "<size> room": fall back to a bare size keyword.
	        remainder = text_lower
	        room_word = first_keyword(text_lower, ROOM_SIZE_BINS)
	        if room_word is not None:
	            params["room_size"] = ROOM_SIZE_BINS[room_word]

	    # Parse direction (azimuth)
	    direction = first_keyword(text_lower, DIRECTION_BINS)
	    if direction is not None:
	        params["azimuth_deg"] = float(DIRECTION_BINS[direction])

	    # Parse elevation
	    elevation = first_keyword(text_lower, ELEVATION_BINS)
	    if elevation is not None:
	        params["elevation_deg"] = float(ELEVATION_BINS[elevation])

	    # Parse distance (whole-word matching keeps "mid" out of "middle")
	    distance = first_keyword(remainder, DISTANCE_BINS)
	    if distance is not None:
	        params["distance_m"] = DISTANCE_BINS[distance]

	    # Parse reverb level. A bare "medium" may equally be the distance
	    # keyword and yields the default anyway, so only keywords that change
	    # the result are searched for.
	    default_reverb = params["reverb_level"]
	    explicit_reverbs = [
	        word for word, level in REVERB_BINS.items() if level != default_reverb
	    ]
	    reverb = first_keyword(remainder, explicit_reverbs)
	    if reverb is not None:
	        params["reverb_level"] = REVERB_BINS[reverb]

	    return params


	def generate_random_spatial_text() -> Tuple[str, Dict[str, Any]]:
	    """
	    Generate random spatial text and corresponding parameters.

	    Each field is drawn independently with np.random.choice, so results
	    follow NumPy's global random state. The direction is drawn uniformly
	    over the keys of DIRECTION_BINS; azimuths that have two spellings
	    (e.g. "front-left" / "frontleft") are therefore drawn twice as often
	    as those with one.

	    Returns:
	        (text, params_dict) where text has the form
	        "<direction>, <elevation>, <distance>, <room_size> room, <reverb>"
	        and params_dict has the keys azimuth_deg, elevation_deg,
	        distance_m, room_size and reverb_level.
	    """
	    # Random sampling. np.random.choice returns numpy.str_ scalars; they
	    # are converted to plain str so the returned values are ordinary
	    # Python objects. The draw order is fixed to keep seeded runs stable.
	    direction = str(np.random.choice(list(DIRECTION_BINS.keys())))
	    elevation = str(np.random.choice(["down", "level", "up"]))
	    distance = str(np.random.choice(["near", "mid", "far"]))
	    room_size = str(np.random.choice(["small", "medium", "large"]))
	    reverb = str(np.random.choice(["dry", "medium", "wet"]))

	    # Build text
	    text = f"{direction}, {elevation}, {distance}, {room_size} room, {reverb}"

	    # Get params
	    params = {
	        "azimuth_deg": float(DIRECTION_BINS[direction]),
	        "elevation_deg": float(ELEVATION_BINS[elevation]),
	        "distance_m": DISTANCE_BINS[distance],
	        "room_size": room_size,
	        "reverb_level": reverb,
	    }

	    return text, params


	def params_to_bins(params: Dict[str, Any]) -> Dict[str, int]:
	    """
	    Convert continuous parameters to bin indices.

	    Args:
	        params: Dict with azimuth_deg, elevation_deg and distance_m
	            (required) plus room_size and reverb_level (optional, both
	            default to "medium").

	    Returns:
	        Dict with plain-int bin indices under the keys direction_bin,
	        elevation_bin, distance_bin, room_bin and reverb_bin. Numeric
	        fields map to the nearest bin centre; ties go to the lower index.

	    Raises:
	        KeyError: If azimuth_deg, elevation_deg or distance_m is missing.
	        ValueError: If room_size or reverb_level is not a known label.
	    """
	    # Direction bin (8 bins). Angles are compared on the circle so that
	    # e.g. -170 degrees lands in the 180 degree bin (10 degrees away)
	    # instead of the -135 degree bin, and 270 is treated like -90.
	    azimuth = params["azimuth_deg"]
	    direction_angles = [0, 45, 90, 135, 180, -135, -90, -45]
	    direction_bin = int(
	        np.argmin(
	            [abs((azimuth - a + 180.0) % 360.0 - 180.0) for a in direction_angles]
	        )
	    )

	    # Elevation bin (3 bins)
	    elevation = params["elevation_deg"]
	    elevation_angles = [-30, 0, 30]
	    elevation_bin = int(np.argmin([abs(elevation - a) for a in elevation_angles]))

	    # Distance bin (3 bins)
	    distance = params["distance_m"]
	    distance_values = [1.0, 2.5, 5.0]
	    distance_bin = int(np.argmin([abs(distance - d) for d in distance_values]))

	    # Room size bin (3 bins)
	    room_sizes = ["small", "medium", "large"]
	    room_bin = room_sizes.index(params.get("room_size", "medium"))

	    # Reverb bin (3 bins)
	    reverb_levels = ["dry", "medium", "wet"]
	    reverb_bin = reverb_levels.index(params.get("reverb_level", "medium"))

	    # int(...) above turns np.argmin's numpy integers into plain ints,
	    # matching the declared return type and the list.index results.
	    return {
	        "direction_bin": direction_bin,
	        "elevation_bin": elevation_bin,
	        "distance_bin": distance_bin,
	        "room_bin": room_bin,
	        "reverb_bin": reverb_bin,
	    }


	def bins_to_one_hot(bins: Dict[str, int]) -> np.ndarray:
	    """
	    Convert bin indices to concatenated one-hot encoding.

	    Args:
	        bins: Dict with the integer keys direction_bin, elevation_bin,
	            distance_bin, room_bin and reverb_bin.

	    Returns:
	        One-hot vector of shape (8 + 3 + 3 + 3 + 3 = 20,), laid out as
	        direction, elevation, distance, room, reverb.

	    Raises:
	        KeyError: If one of the five keys is missing.
	        IndexError: If an index is outside its segment. Negative indices
	            are rejected too; they would otherwise wrap around silently
	            and set the wrong slot.
	    """
	    # (key, number of bins) in output order.
	    layout = (
	        ("direction_bin", 8),
	        ("elevation_bin", 3),
	        ("distance_bin", 3),
	        ("room_bin", 3),
	        ("reverb_bin", 3),
	    )

	    one_hot = np.zeros(sum(size for _, size in layout))
	    offset = 0
	    for key, size in layout:
	        index = bins[key]
	        if not 0 <= index < size:
	            raise IndexError(
	                f"{key}={index} is outside the valid range 0..{size - 1}"
	            )
	        one_hot[offset + index] = 1.0
	        offset += size

	    return one_hot