Spaces:
Sleeping
Sleeping
| """Voice Input Service for Speech Recognition""" | |
| import speech_recognition as sr | |
| from typing import Optional, Callable | |
| import io | |
class VoiceInputService:
    """Service for handling voice input and speech recognition.

    Wraps a ``speech_recognition`` recognizer plus an optional microphone and
    turns recognized utterances into structured song requests.

    Instance fields set by ``__init__``:
        recognizer: the ``sr.Recognizer`` used for listening and decoding.
        microphone: the ``sr.Microphone``, or ``None`` if it could not be created.
        available: ``True`` only when the microphone initialised successfully.
    """

    # Phrases that introduce a request, in priority order: every phrase is
    # listed before any shorter phrase it contains, and only the first phrase
    # that matches is removed from the utterance.
    _ACTION_PHRASES = (
        "i want to hear", "i want to", "want to hear",
        "i'd like to hear", "i would like to hear",
        "play", "put on", "listen to", "i want",
        "can you", "please", "i'd like", "i would like",
    )
    _PLAY_PHRASES = ("play", "put on", "listen to", "want to hear")
    _SKIP_WORDS = ("skip", "next", "change")
    # Dropped from the free-text song query (not from "<song> by <artist>").
    _FILLER_WORDS = frozenset({"i", "a", "an", "the"})
    # Checked in this order; the first genre present in the utterance wins.
    _KNOWN_GENRES = (
        "pop", "rock", "jazz", "classical", "electronic", "hip-hop",
        "hip hop", "country", "indie", "rap", "blues", "folk",
    )
    _GENRE_INDICATORS = frozenset({"music", "song", "track", "tune"})
    _GENRE_QUANTIFIERS = frozenset({"some", "any"})
    # Words that are never accepted as a guessed genre.  Command words and
    # demonstratives are included so "skip this song" does not yield "this".
    _COMMON_WORDS = frozenset({
        "i", "want", "to", "hear", "play", "put", "on", "listen", "some",
        "a", "an", "the", "me", "my",
        "this", "that", "next", "another", "skip", "change",
    })
    _MOODS = (
        "happy", "sad", "energetic", "calm", "relaxed", "focused",
        "upbeat", "chill",
    )
    # Stripped from both ends of each word; apostrophes and hyphens are kept
    # so "i'd" and "hip-hop" survive tokenisation.
    _EDGE_PUNCTUATION = ".,!?;:\""

    def __init__(self):
        """Create the recognizer and try to open and calibrate the microphone.

        Never raises for microphone problems: on failure a warning is printed
        and ``available`` stays ``False``.
        """
        self.recognizer = sr.Recognizer()
        self.microphone = None
        self.available = False
        try:
            # Creating the microphone requires PyAudio / PortAudio.
            self.microphone = sr.Microphone()
            # Sample the background level so the energy threshold is sensible.
            with self.microphone as source:
                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
            self.available = True
        except (OSError, AttributeError) as e:
            # OSError: no usable audio device.  AttributeError: what
            # speech_recognition raises when PyAudio cannot be imported.
            # Both are addressed by the installation hints below.
            print(f"Warning: Could not initialize microphone: {e}")
            print("Voice input will not be available")
            print("To enable voice input, install PortAudio:")
            print(" macOS: brew install portaudio")
            print(" Linux: sudo apt-get install portaudio19-dev")
            print(" Then: pip install pyaudio")
            self.available = False
        except Exception as e:
            # Best effort: voice input is optional, so never fail construction.
            print(f"Warning: Could not initialize microphone: {e}")
            print("Voice input will not be available")
            self.available = False

    def listen_and_recognize(self, timeout: int = 5, phrase_time_limit: int = 10) -> Optional[str]:
        """Listen on the microphone and return the recognized speech.

        Args:
            timeout: Maximum time to wait for speech to start.
            phrase_time_limit: Maximum time for a phrase.

        Returns:
            The recognized text, or ``None`` when voice input is unavailable,
            nothing was heard in time, the audio could not be understood, or
            the recognition request failed.
        """
        if not self.available or not self.microphone:
            return None
        try:
            with self.microphone as source:
                print("Listening... Speak now!")
                audio = self.recognizer.listen(
                    source,
                    timeout=timeout,
                    phrase_time_limit=phrase_time_limit,
                )
            print("Processing speech...")
            # Use Google's free speech recognition API.
            text = self.recognizer.recognize_google(audio)
            print(f"Recognized: {text}")
            return text
        except sr.WaitTimeoutError:
            print("No speech detected within timeout")
            return None
        except sr.UnknownValueError:
            print("Could not understand audio")
            return None
        except sr.RequestError as e:
            print(f"Error with speech recognition service: {e}")
            return None
        except Exception as e:
            # Boundary handler: callers only ever see text or None.
            print(f"Error during voice recognition: {e}")
            return None

    @classmethod
    def _tokenize(cls, text):
        """Return the lowercased words of *text* with edge punctuation removed."""
        if not text:
            return []
        trimmed = (raw.strip(cls._EDGE_PUNCTUATION) for raw in text.lower().split())
        return [word for word in trimmed if word]

    @staticmethod
    def _find_phrase(words, phrase):
        """Return the index where *phrase* starts as whole words in *words*, or -1."""
        target = phrase.split()
        span = len(target)
        for start in range(len(words) - span + 1):
            if words[start:start + span] == target:
                return start
        return -1

    @classmethod
    def _first_match(cls, words, candidates):
        """Return the first candidate phrase present in *words*, or ``None``."""
        return next(
            (phrase for phrase in candidates if cls._find_phrase(words, phrase) >= 0),
            None,
        )

    @classmethod
    def _strip_action_phrase(cls, words):
        """Return *words* without the first occurrence of the top-priority action phrase."""
        for phrase in cls._ACTION_PHRASES:
            start = cls._find_phrase(words, phrase)
            if start >= 0:
                end = start + len(phrase.split())
                return words[:start] + words[end:]
        return list(words)

    @classmethod
    def _guess_custom_genre(cls, words):
        """Guess a genre outside the known list from word position, or ``None``.

        A word qualifies when it directly precedes "music"/"song"/"track"/
        "tune", or directly follows "some"/"any"; common words never qualify.
        """
        for i, word in enumerate(words):
            if i + 1 < len(words) and words[i + 1] in cls._GENRE_INDICATORS:
                if word not in cls._COMMON_WORDS and len(word) > 2:
                    return word
            elif (
                i > 0
                and words[i - 1] in cls._GENRE_QUANTIFIERS
                and word not in cls._COMMON_WORDS
                and len(word) > 3
                and word[0].isalpha()
            ):
                return word
        return None

    def process_song_request(self, recognized_text: str) -> dict:
        """Turn recognized speech into a structured song request.

        All keyword matching is done on whole words, so "display" does not
        count as "play" and "wrap" does not count as the genre "rap".

        Args:
            recognized_text: Text recognized from speech.  ``None`` or an
                empty string yields a default request.

        Returns:
            Dictionary with the keys:
                original_text: *recognized_text*, unchanged.
                action: "skip" when a skip word is present and no play phrase
                    is; otherwise "play".
                song: lowercased query text.  With "<song> by <artist>" this
                    is the part before the last usable "by"; otherwise the
                    utterance minus one action phrase and filler words,
                    falling back to *recognized_text* when nothing is left.
                artist: the part after "by", or ``None``.
                genre: first known genre present, else a positional guess,
                    else ``None``.
                mood: first known mood present, or ``None``.

            Genre and mood are reported as additional tags; their words are
            deliberately left in ``song`` so titles such as "happy birthday"
            are not truncated.
        """
        request = {
            "original_text": recognized_text,
            "action": "play",  # Default when no skip word is found.
            "song": None,
            "artist": None,
            "genre": None,
            "mood": None,
        }
        tokens = self._tokenize(recognized_text)

        # A play phrase takes precedence over a skip word ("play the next one").
        wants_play = self._first_match(tokens, self._PLAY_PHRASES) is not None
        if not wants_play and any(word in tokens for word in self._SKIP_WORDS):
            request["action"] = "skip"

        query = self._strip_action_phrase(tokens)
        content_words = [word for word in query if word not in self._FILLER_WORDS]

        request["genre"] = (
            self._first_match(tokens, self._KNOWN_GENRES)
            or self._guess_custom_genre(content_words)
        )
        request["mood"] = self._first_match(tokens, self._MOODS)

        # Split on the LAST "by" that has text on both sides, so a title that
        # itself contains "by" ("stand by me by ...") stays intact.
        by_at = max(
            (i for i, word in enumerate(query) if word == "by" and 0 < i < len(query) - 1),
            default=-1,
        )
        if by_at > 0:
            # Filler words are kept here so artists like "the beatles" survive.
            request["song"] = " ".join(query[:by_at])
            request["artist"] = " ".join(query[by_at + 1:])
        else:
            request["song"] = " ".join(content_words) or recognized_text
        return request