RoneyBABA committed on
Commit 89ef5a0 · verified · 1 Parent(s): 66f45be

Upload 5 files

Files changed (5)
  1. app.py +55 -70
  2. apt.txt +1 -0
  3. hgface_requirements.txt +6 -0
  4. model.py +70 -0
  5. patient.py +70 -0
app.py CHANGED
@@ -1,70 +1,55 @@
- import gradio as gr
- from huggingface_hub import InferenceClient
-
-
- def respond(
-     message,
-     history: list[dict[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     hf_token: gr.OAuthToken,
- ):
-     """
-     For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-     """
-     client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-     messages = [{"role": "system", "content": system_message}]
-
-     messages.extend(history)
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         choices = message.choices
-         token = ""
-         if len(choices) and choices[0].delta.content:
-             token = choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- chatbot = gr.ChatInterface(
-     respond,
-     type="messages",
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
- with gr.Blocks() as demo:
-     with gr.Sidebar():
-         gr.LoginButton()
-     chatbot.render()
-
-
- if __name__ == "__main__":
-     demo.launch()
 
+ # if you don't use pipenv, uncomment the following:
+ # from dotenv import load_dotenv
+ # load_dotenv()
+
+ # VoiceBot UI with Gradio
+ import os
+ import gradio as gr
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ from model import encode_image, analyze_image_with_query, analyze_query
+ from patient import record_audio, transcription
+
+ # load_dotenv()
+
+ system_prompt = """You are a professional doctor. The given input is the patient's query.
+ What's in this image (if provided)? Do you find anything medically wrong with it?
+ Suggest some quick response actions that can be implemented immediately. Do not add any numbers or special characters in
+ your response. Your response should be in one long paragraph. Also, always answer as if you are answering a real person.
+ Do not say 'In the image I see' but say 'With what I see, I think you have ....'
+ End the response with the specialist (e.g. urologist, cardiologist) the user should consult; it must strictly be the very last word of the response.
+ Do not respond as an AI model in markdown; your answer should mimic that of an actual doctor, not an AI bot.
+ Keep your answer concise (max 2 sentences). No preamble; start your answer right away, please."""
+
+
+ def process_inputs(audio_filepath, image_filepath=None):
+     speech_to_text_output = transcription(GROQ_API_KEY=os.environ.get("GROQ_API_KEY"),
+                                           audio_filepath=audio_filepath,
+                                           stt_model="whisper-large-v3")
+
+     if not image_filepath:
+         doctor_response = analyze_query(query=system_prompt + speech_to_text_output, model="meta-llama/llama-4-scout-17b-16e-instruct")
+     else:
+         doctor_response = analyze_image_with_query(query=system_prompt + speech_to_text_output, encoded_image=encode_image(image_filepath),
+                                                    model="meta-llama/llama-4-scout-17b-16e-instruct")
+     return speech_to_text_output, doctor_response
+
+
+ # Create the interface
+ iface = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath"),
+         gr.Image(type="filepath")
+     ],
+     outputs=[
+         gr.Textbox(label="Speech to Text"),
+         gr.Textbox(label="Doctor's Response")
+     ],
+     title="AI Doctor with Vision and Voice"
+ )
+
+ iface.launch(debug=True)
+
+ # http://127.0.0.1:7860
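Because iface.launch() runs at import time, the easiest way to smoke-test the pipeline without the UI is to call the helpers directly. A minimal sketch, not part of this commit, assuming GROQ_API_KEY is set and that patient_message.mp3 and skin_rash.jpg exist locally (file names taken from comments elsewhere in this commit):

    import os
    from patient import transcription
    from model import encode_image, analyze_image_with_query

    # app.py prepends system_prompt to the transcript before this call
    query = transcription(stt_model="whisper-large-v3",
                          audio_filepath="patient_message.mp3",
                          GROQ_API_KEY=os.environ.get("GROQ_API_KEY"))
    answer = analyze_image_with_query(query=query,
                                      encoded_image=encode_image("skin_rash.jpg"),
                                      model="meta-llama/llama-4-scout-17b-16e-instruct")
    print(query)
    print(answer)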
apt.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
hgface_requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio>=3.38
+ groq
+ python-dotenv
+ pydub
+ SpeechRecognition
+ requests
model.py ADDED
@@ -0,0 +1,70 @@
+ # if you don't use pipenv, uncomment the following:
+ # from dotenv import load_dotenv
+ # load_dotenv()
+
+ # Step 1: Setup GROQ API key
+ import os
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+
+ # Step 2: Convert image to required format
+ import base64
+
+ if GROQ_API_KEY is None:
+     raise ValueError("GROQ_API_KEY is not set! Add it to your environment or .env file.")
+
+ # image_path = "D:/College/Major/ai-doctor-2.0-voice-and-vision/skin_rash.jpg"
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ # Step 3: Setup multimodal LLM
+ from groq import Groq
+
+ model = "meta-llama/llama-4-maverick-17b-128e-instruct"  # default; callers pass their own model
+
+ def analyze_image_with_query(query, model, encoded_image):
+     client = Groq(api_key=GROQ_API_KEY)
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": query
+                 },
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/jpeg;base64,{encoded_image}",
+                     },
+                 },
+             ],
+         }]
+     chat_completion = client.chat.completions.create(
+         messages=messages,
+         model=model
+     )
+
+     return chat_completion.choices[0].message.content
+
+ def analyze_query(query, model):
+     client = Groq(api_key=GROQ_API_KEY)
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {
+                     "type": "text",
+                     "text": query
+                 },
+             ],
+         }]
+     chat_completion = client.chat.completions.create(
+         messages=messages,
+         model=model
+     )
+
+     return chat_completion.choices[0].message.content
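For the text-only path, analyze_query can be exercised on its own. A minimal sketch, assuming GROQ_API_KEY is available in the environment or a .env file; the query string is only illustrative:

    from model import analyze_query

    response = analyze_query(query="I have had a mild headache and nausea for two days.",
                             model="meta-llama/llama-4-scout-17b-16e-instruct")
    print(response)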
patient.py ADDED
@@ -0,0 +1,70 @@
+ # if you don't use pipenv, uncomment the following:
+ # from dotenv import load_dotenv
+ # load_dotenv()
+
+ # Step 1: Setup audio recorder (ffmpeg & portaudio)
+ # ffmpeg, portaudio, pyaudio
+ import logging
+ import speech_recognition as sr
+ from pydub import AudioSegment
+ from io import BytesIO
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def record_audio(file_path, timeout=5, phrase_time_limit=10):
+     """
+     Simplified function to record audio from the microphone and save it as an MP3 file.
+
+     Args:
+         file_path (str): Path to save the recorded audio file.
+         timeout (int): Maximum time to wait for a phrase to start (in seconds).
+         phrase_time_limit (int): Maximum time for the phrase to be recorded (in seconds).
+     """
+     recognizer = sr.Recognizer()
+
+     try:
+         with sr.Microphone() as source:
+             logging.info("Adjusting for ambient noise...")
+             recognizer.adjust_for_ambient_noise(source, duration=1)
+             logging.info("Start speaking now...")
+
+             # Record the audio
+             logging.info(f"Recording for {phrase_time_limit} seconds...")
+             audio_data = recognizer.record(source, duration=phrase_time_limit)
+             # audio_data = recognizer.listen(source, timeout=timeout, phrase_time_limit=phrase_time_limit)
+             logging.info("Recording complete.")
+
+             # Convert the recorded audio to an MP3 file
+             wav_data = audio_data.get_wav_data()
+             audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
+             audio_segment.export(file_path, format="mp3", bitrate="128k")
+
+             logging.info(f"Audio saved to {file_path}")
+
+     except Exception as e:
+         logging.error(f"An error occurred: {e}")
+
+ audio_filepath = "patient_message.mp3"
+
+ # Step 2: Setup speech-to-text (STT) model for transcription
+ def transcription(stt_model, audio_filepath, GROQ_API_KEY):
+     import os
+     from groq import Groq
+     from dotenv import load_dotenv
+     load_dotenv()
+
+     # Fall back to the environment/.env value only if no key was passed in
+     if GROQ_API_KEY is None:
+         GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
+     if GROQ_API_KEY is None:
+         raise ValueError("GROQ_API_KEY is not set! Add it to your environment or .env file.")
+     client = Groq(api_key=GROQ_API_KEY)
+
+     with open(audio_filepath, "rb") as audio_file:
+         transcription = client.audio.transcriptions.create(
+             model=stt_model,
+             file=audio_file,
+             language="en"
+         )
+
+     return transcription.text
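record_audio is imported in app.py but never called there, since the Gradio Audio component already supplies a recorded file path; it is mainly useful for command-line testing. A minimal sketch, assuming a working microphone, portaudio/PyAudio installed (as the comment at the top of the file notes), and GROQ_API_KEY set:

    import os
    from patient import record_audio, transcription

    record_audio(file_path="patient_message.mp3", phrase_time_limit=10)
    text = transcription(stt_model="whisper-large-v3-turbo",
                         audio_filepath="patient_message.mp3",
                         GROQ_API_KEY=os.environ.get("GROQ_API_KEY"))
    print(text)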