# AI-RADIO / src /voice_input.py
# Nikita Makarov
# ai radio with sample auth
# 61f40a7
"""Voice Input Service for Speech Recognition"""
import io
import re
from typing import Optional, Callable

import speech_recognition as sr
class VoiceInputService:
    """Capture speech from the microphone and parse it into song requests.

    Attributes:
        recognizer: The ``sr.Recognizer`` used for listening and recognition.
        microphone: The ``sr.Microphone`` instance, or ``None`` if it could
            not be created.
        available: ``True`` only when the microphone was initialized and
            calibrated successfully.
    """

    # Request/politeness phrases stripped from the start of an utterance.
    # Sorted longest-first so "i want to hear" wins over "i want".
    _ACTION_PHRASES = tuple(sorted(
        (
            "i want to hear", "i want to", "want to hear",
            "i'd like to hear", "i would like to hear",
            "play", "put on", "listen to", "i want",
            "can you", "please", "i'd like", "i would like",
        ),
        key=len,
        reverse=True,
    ))
    _PLAY_PHRASES = ("play", "put on", "listen to", "want to hear")
    _SKIP_PHRASES = ("skip", "next", "change")
    # Standalone words dropped from the free-text query.
    _FILLER_WORDS = frozenset({"i", "a", "an", "the"})
    _KNOWN_GENRES = (
        "pop", "rock", "jazz", "classical", "electronic", "hip-hop",
        "hip hop", "country", "indie", "rap", "blues", "folk",
    )
    # Nouns that signal "the previous word is probably a genre".
    _GENRE_INDICATORS = frozenset({"music", "song", "track", "tune"})
    # Words that signal "the next word is probably a genre".
    _QUANTIFIERS = frozenset({"some", "any"})
    # Words that are never treated as a custom genre.
    _COMMON_WORDS = frozenset({
        "i", "want", "to", "hear", "play", "put", "on", "listen", "some",
        "a", "an", "the", "me", "my", "any", "this", "that",
    })
    _MOODS = (
        "happy", "sad", "energetic", "calm", "relaxed", "focused",
        "upbeat", "chill",
    )

    def __init__(self):
        """Create the recognizer and try to open and calibrate the microphone.

        Failures are reported on stdout and leave ``available`` set to
        ``False``; the constructor itself never raises for them.
        """
        self.recognizer = sr.Recognizer()
        self.microphone = None
        self.available = False
        try:
            # Try to initialize microphone (requires pyaudio)
            self.microphone = sr.Microphone()
            # Adjust for ambient noise
            with self.microphone as source:
                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
            self.available = True
        except (OSError, AttributeError) as e:
            # SpeechRecognition reports a missing PyAudio install as
            # AttributeError, so the install hint belongs here as well as
            # for OSError.
            print(f"Warning: Could not initialize microphone: {e}")
            print("Voice input will not be available")
            print("To enable voice input, install PortAudio:")
            print(" macOS: brew install portaudio")
            print(" Linux: sudo apt-get install portaudio19-dev")
            print(" Then: pip install pyaudio")
            self.available = False
        except Exception as e:
            # Deliberate best-effort: voice input is optional, so any other
            # failure only disables it.
            print(f"Warning: Could not initialize microphone: {e}")
            print("Voice input will not be available")
            self.available = False

    def listen_and_recognize(self, timeout: int = 5, phrase_time_limit: int = 10) -> Optional[str]:
        """Listen to the microphone and recognize speech.

        Args:
            timeout: Maximum time to wait for speech to start.
            phrase_time_limit: Maximum time for a phrase.

        Returns:
            Recognized text, or ``None`` if voice input is unavailable or
            any listening/recognition error occurred (the reason is printed).
        """
        if not self.available or not self.microphone:
            return None
        try:
            with self.microphone as source:
                print("Listening... Speak now!")
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=phrase_time_limit,
                )
            # The audio is fully captured at this point, so recognition runs
            # after the microphone context has been closed.
            print("Processing speech...")
            # Use Google's free speech recognition API
            text = self.recognizer.recognize_google(audio)
            print(f"Recognized: {text}")
            return text
        except sr.WaitTimeoutError:
            print("No speech detected within timeout")
            return None
        except sr.UnknownValueError:
            print("Could not understand audio")
            return None
        except sr.RequestError as e:
            print(f"Error with speech recognition service: {e}")
            return None
        except Exception as e:
            print(f"Error during voice recognition: {e}")
            return None

    @staticmethod
    def _has_phrase(text: str, phrase: str) -> bool:
        """Return True if *phrase* occurs in *text* as whole word(s)."""
        return re.search(rf"\b{re.escape(phrase)}\b", text) is not None

    @staticmethod
    def _remove_phrase(text: str, phrase: str) -> str:
        """Remove whole-word occurrences of *phrase* and collapse spaces."""
        without = re.sub(rf"\b{re.escape(phrase)}\b", " ", text)
        return " ".join(without.split())

    @classmethod
    def _detect_action(cls, text_lower: str) -> str:
        """Classify the utterance as "play" or "skip" ("play" by default)."""
        if any(cls._has_phrase(text_lower, p) for p in cls._PLAY_PHRASES):
            return "play"
        if any(cls._has_phrase(text_lower, p) for p in cls._SKIP_PHRASES):
            return "skip"
        return "play"

    @classmethod
    def _strip_request_phrases(cls, text_lower: str) -> str:
        """Drop the leading run of request phrases and a trailing "please".

        Each phrase is removed at most once so that a title which repeats a
        request word ("please please me") is not consumed entirely.
        """
        remaining = text_lower.strip()
        unused = list(cls._ACTION_PHRASES)
        stripped_one = True
        while stripped_one and remaining:
            stripped_one = False
            for phrase in unused:
                match = re.match(rf"{re.escape(phrase)}\b[\s,]*", remaining)
                if match:
                    remaining = remaining[match.end():]
                    unused.remove(phrase)
                    stripped_one = True
                    break
        remaining = re.sub(r"[\s,]*\bplease\W*$", "", remaining)
        return remaining.strip()

    @classmethod
    def _detect_mood(cls, text_lower: str) -> Optional[str]:
        """Return the first known mood mentioned as a whole word, if any."""
        for mood in cls._MOODS:
            if cls._has_phrase(text_lower, mood):
                return mood
        return None

    @classmethod
    def _detect_genre(cls, text_lower: str, words: list) -> Optional[str]:
        """Return a known genre, else a custom one guessed from context.

        A custom genre is a word directly before "music"/"song"/"track"/
        "tune", or a longer word directly after "some"/"any". Common words,
        the indicator nouns themselves and mood words are never genres.
        """
        for genre in cls._KNOWN_GENRES:
            if cls._has_phrase(text_lower, genre):
                return genre

        for index, word in enumerate(words):
            if (
                word in cls._COMMON_WORDS
                or word in cls._GENRE_INDICATORS
                or word in cls._MOODS
            ):
                continue
            precedes_indicator = (
                index + 1 < len(words)
                and words[index + 1] in cls._GENRE_INDICATORS
            )
            if precedes_indicator:
                if len(word) > 2:
                    return word
                continue
            follows_quantifier = (
                index > 0 and words[index - 1] in cls._QUANTIFIERS
            )
            if follows_quantifier and len(word) > 3 and word[0].isalpha():
                return word
        return None

    def process_song_request(self, recognized_text: Optional[str]) -> dict:
        """Parse recognized speech into a structured song request.

        Args:
            recognized_text: Text recognized from speech. ``None`` is treated
                as an empty utterance.

        Returns:
            Dictionary with the keys ``original_text`` (the argument as
            passed), ``action`` ("play" or "skip"), ``song`` (always a
            string), and ``artist``, ``genre``, ``mood`` (each a string or
            ``None``).
        """
        text = recognized_text or ""
        # Lowercase and collapse whitespace once; all matching uses this form.
        text_lower = " ".join(text.lower().split())

        request = {
            "original_text": recognized_text,
            "action": self._detect_action(text_lower),
            "song": None,
            "artist": None,
            "genre": None,
            "mood": None,
        }

        # "stripped" keeps articles (needed for artist names such as
        # "the beatles"); "cleaned_text" additionally drops filler words.
        stripped = self._strip_request_phrases(text_lower)
        words = [w for w in stripped.split() if w not in self._FILLER_WORDS]
        cleaned_text = " ".join(words)

        # Genre and mood must be known before the song text is built,
        # because they are removed from it below.
        request["mood"] = self._detect_mood(text_lower)
        request["genre"] = self._detect_genre(text_lower, words)

        # Split on the last standalone "by" so titles that contain the word
        # ("stand by me by ...") keep it and words like "baby" are not split.
        song_part, separator, artist_part = stripped.rpartition(" by ")
        if separator and song_part.strip() and artist_part.strip():
            request["song"] = song_part.strip()
            request["artist"] = artist_part.strip()
        else:
            # No artist: treat the cleaned text as the song/query, minus
            # the genre and mood words that were already extracted.
            song_text = cleaned_text or text
            for label in (request["genre"], request["mood"]):
                if label:
                    song_text = self._remove_phrase(song_text, label)
            request["song"] = song_text or text

        return request