Spaces:

RoneyBABA
/

MajorProj

Running

App Files Files Community

RoneyBABA committed 4 days ago

Commit

89ef5a0

verified ·

1 Parent(s): 66f45be

Upload 5 files

Browse files

Files changed (5) hide show

app.py +55 -70
apt.txt +1 -0
hgface_requirements.txt +6 -0
model.py +70 -0
patient.py +70 -0

app.py CHANGED Viewed

@@ -1,70 +1,55 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
-    """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-    """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-    messages = [{"role": "system", "content": system_message}]
-    messages.extend(history)
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
-if __name__ == "__main__":
-    demo.launch()

+# if you don't use pipenv uncomment the following:
+# from dotenv import load_dotenv
+# load_dotenv()
+#VoiceBot UI with Gradio
+import os
+import gradio as gr
+from dotenv import load_dotenv
+load_dotenv()
+from model import encode_image, analyze_image_with_query, analyze_query
+from patient import record_audio, transcription
+#load_dotenv()
+# Instruction text that process_inputs prepends to the transcribed patient
+# query before the combined string is sent to the chat model.
+system_prompt="""You are a professional doctor. Given input is the querry of patient.
+            What's in this image (if provided)?. Do you find anything wrong with it medically?
+            Suggest some quick response actions, which can be implemented immediately. Do not add any numbers or special characters in
+            your response. Your response should be in one long paragraph. Also always answer as if you are answering to a real person.
+            Donot say 'In the image I see' but say 'With what I see, I think you have ....'
+            Do end the response with the specialist (ex:urologist, cardiologist) the user should consult and it strictly should be the very last word of the response.
+            Dont respond as an AI model in markdown, your answer should mimic that of an actual doctor not an AI bot,
+            Keep your answer concise (max 2 sentences). No preamble, start your answer right away please"""
+def process_inputs(audio_filepath, image_filepath = None):
+    """Transcribe the patient's recording and ask the model for a reply.
+
+    Args:
+        audio_filepath: Path of the recorded audio file. A falsy value
+            (None or "") means nothing was recorded.
+        image_filepath: Optional path of an image to send with the query.
+
+    Returns:
+        A ``(speech_to_text_output, doctor_response)`` tuple of strings.
+    """
+    # Without a recording there is nothing to transcribe; answer with a
+    # readable message instead of letting open() fail on a missing path.
+    if not audio_filepath:
+        return "", "No audio was received. Please record your question and try again."
+
+    llm_model = "meta-llama/llama-4-scout-17b-16e-instruct"
+    speech_to_text_output = transcription(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                                          audio_filepath=audio_filepath,
+                                          stt_model="whisper-large-v3")
+    # Keep the instructions and the patient's words apart so the last word of
+    # the prompt does not fuse with the first word of the transcript.
+    query = f"{system_prompt}\n\n{speech_to_text_output}"
+    if not image_filepath:
+        doctor_response = analyze_query(query=query, model=llm_model)
+    else:
+        doctor_response = analyze_image_with_query(query=query,
+                                                   encoded_image=encode_image(image_filepath),
+                                                   model=llm_model)
+    return speech_to_text_output, doctor_response
+# Build the Gradio UI: microphone audio plus an optional image go in,
+# the transcript and the model's reply come out.
+audio_input = gr.Audio(sources=["microphone"], type="filepath")
+image_input = gr.Image(type="filepath")
+transcript_box = gr.Textbox(label="Speech to Text")
+response_box = gr.Textbox(label="Doctor's Response")
+iface = gr.Interface(
+    fn=process_inputs,
+    inputs=[audio_input, image_input],
+    outputs=[transcript_box, response_box],
+    title="AI Doctor with Vision and Voice",
+)
+# Start the web server as soon as this module is executed.
+iface.launch(debug=True)
+#http://127.0.0.1:7860

apt.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

hgface_requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# app.py calls gr.Audio(sources=[...]), which needs the Gradio 4 API.
+gradio>=4.0
+groq
+python-dotenv
+pydub
+SpeechRecognition
+requests

model.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# if you don't use pipenv uncomment the following:
+# from dotenv import load_dotenv
+# load_dotenv()
+#Step1: Setup GROQ API key
+import os
+from dotenv import load_dotenv
+load_dotenv()
+GROQ_API_KEY=os.environ.get("GROQ_API_KEY")
+#Step2: Convert image to required format
+import base64
+# Fail at import time when the key is missing: both analyze_* functions in
+# this module build their Groq client from GROQ_API_KEY.
+if GROQ_API_KEY is None:
+    raise ValueError("GROQ_API_KEY is not set! Add it to your environment or .env file.")
+#image_path="D:/College/Major/ai-doctor-2.0-voice-and-vision/skin_rash.jpg"
+def encode_image(image_path):
+    """Return the bytes of the file at ``image_path`` as a base64 string.
+
+    Args:
+        image_path: Path of the file to read in binary mode.
+
+    Returns:
+        The file contents, base64-encoded and decoded to a UTF-8 ``str``.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    # The context manager closes the handle even if reading fails.
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+#Step3: Setup Multimodal LLM
+from groq import Groq
+# Module-level model name. The analyze_* functions below take their own
+# `model` parameter, so they do not read this value.
+model="meta-llama/llama-4-maverick-17b-128e-instruct"
+def analyze_image_with_query(query, model, encoded_image, mime_type="image/jpeg"):
+    """Send a text query together with an image to a Groq chat model.
+
+    Args:
+        query: Text placed in the user message.
+        model: Name of the Groq model to call.
+        encoded_image: Base64-encoded image data (as produced by encode_image).
+        mime_type: MIME type written into the data URL. Defaults to
+            "image/jpeg", the value that was previously hard-coded; pass e.g.
+            "image/png" when the encoded file is not a JPEG.
+
+    Returns:
+        The content of the first choice's message in the completion.
+    """
+    client=Groq(api_key=GROQ_API_KEY)
+    # The image travels inline as a data URL next to the text part.
+    image_url = f"data:{mime_type};base64,{encoded_image}"
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": query
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                },
+            ],
+        }]
+    chat_completion=client.chat.completions.create(
+        messages=messages,
+        model=model
+    )
+    return chat_completion.choices[0].message.content
+def analyze_query(query, model):
+    """Send a text-only query to a Groq chat model and return the reply text.
+
+    Args:
+        query: Text placed in the single user message.
+        model: Name of the Groq model to call.
+
+    Returns:
+        The content of the first choice's message in the completion.
+    """
+    groq_client = Groq(api_key=GROQ_API_KEY)
+    text_part = {"type": "text", "text": query}
+    user_message = {"role": "user", "content": [text_part]}
+    completion = groq_client.chat.completions.create(
+        messages=[user_message],
+        model=model,
+    )
+    first_choice = completion.choices[0]
+    return first_choice.message.content

patient.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# if you don't use pipenv uncomment the following:
+# from dotenv import load_dotenv
+# load_dotenv()
+#Step1: Setup Audio recorder (ffmpeg & portaudio)
+# ffmpeg, portaudio, pyaudio
+import logging
+import speech_recognition as sr
+from pydub import AudioSegment
+from io import BytesIO
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def record_audio(file_path, timeout=5, phrase_time_limit=10):
+    """
+    Record audio from the microphone and save it as an MP3 file.
+
+    Failures are logged (with traceback) and not re-raised, so the function
+    returns None whether or not a file was written.
+
+    Args:
+    file_path (str): Path to save the recorded audio file.
+    timeout (int): Currently unused; kept so existing callers keep working.
+        Recording uses a fixed duration rather than waiting for a phrase.
+    phrase_time_limit (int): Length of the recording (in seconds).
+    """
+    recognizer = sr.Recognizer()
+    try:
+        with sr.Microphone() as source:
+            logging.info("Adjusting for ambient noise...")
+            recognizer.adjust_for_ambient_noise(source, duration=1)
+            logging.info("Start speaking now...")
+            # Record for a fixed duration.
+            logging.info("Recording for %s seconds...", phrase_time_limit)
+            audio_data = recognizer.record(source, duration=phrase_time_limit)
+            logging.info("Recording complete.")
+        # The microphone is released before conversion; only the captured
+        # data is needed to build the MP3 file.
+        wav_data = audio_data.get_wav_data()
+        audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+        audio_segment.export(file_path, format="mp3", bitrate="128k")
+        logging.info("Audio saved to %s", file_path)
+    except Exception as e:
+        # Best-effort by design: report the failure, keep the traceback.
+        logging.exception("An error occurred: %s", e)
+# Module-level path for the patient's recording. record_audio and
+# transcription take their paths as parameters and do not read this name.
+audio_filepath="patient_message.mp3"
+#Step2: Setup Speech to text–STT–model for transcription
+def transcription(stt_model, audio_filepath, GROQ_API_KEY=None):
+    """Transcribe an English audio file with a Groq speech-to-text model.
+
+    Args:
+        stt_model: Name of the speech-to-text model. A falsy value falls back
+            to "whisper-large-v3-turbo".
+        audio_filepath: Path of the audio file to transcribe.
+        GROQ_API_KEY: API key for Groq. A falsy value falls back to the
+            GROQ_API_KEY environment variable (after loading .env).
+
+    Returns:
+        The ``text`` attribute of the transcription result.
+
+    Raises:
+        ValueError: If no API key is passed in or found in the environment.
+        OSError: If the audio file cannot be opened.
+    """
+    import os
+    from groq import Groq
+    from dotenv import load_dotenv
+    load_dotenv()
+    # Honour the caller's arguments; the environment and the default model
+    # are used only when a value was not supplied.
+    api_key = GROQ_API_KEY or os.environ.get("GROQ_API_KEY")
+    if not api_key:
+        raise ValueError("GROQ_API_KEY is not set! Add it to your environment or .env file.")
+    model_name = stt_model or "whisper-large-v3-turbo"
+    client=Groq(api_key=api_key)
+    # Close the audio file as soon as the request has been made.
+    with open(audio_filepath, "rb") as audio_file:
+        result=client.audio.transcriptions.create(
+            model=model_name,
+            file=audio_file,
+            language="en"
+        )
+    return result.text