morgankavanagh commited on
Commit
d130b8e
·
1 Parent(s): 1fb8162

Fixes for Docker, chrf, comet_hf, and interface

Browse files
Files changed (4) hide show
  1. Dockerfile +6 -2
  2. evaluator/chrf.py +1 -1
  3. evaluator/comet_hf.py +33 -27
  4. interface.py +91 -85
Dockerfile CHANGED
@@ -14,11 +14,15 @@ COPY . .
14
  RUN apt-get update && apt-get install -y \
15
  git \
16
  build-essential \
17
- && rm -rf /var/lib/apt/lists/*
18
 
19
  # Install Python dependencies
20
  RUN pip install --upgrade pip
21
- RUN pip install -r requirements.txt
 
 
 
 
22
 
23
  # Expose port for Gradio
24
  EXPOSE 7860
 
14
  RUN apt-get update && apt-get install -y \
15
  git \
16
  build-essential \
17
+ && rm -rf /var/lib/apt/lists/*
18
 
19
  # Install Python dependencies
20
  RUN pip install --upgrade pip
21
+ RUN pip install --no-cache-dir -r requirements.txt
22
+ RUN pip install --no-cache-dir unbabel-comet
23
+
24
+ # Environment variables
25
+ ENV COMET_CACHE="/tmp"
26
 
27
  # Expose port for Gradio
28
  EXPOSE 7860
evaluator/chrf.py CHANGED
@@ -18,7 +18,7 @@ def calculate_chrf(
18
  :param beta:
19
  The weight of recall in the F-score. Default is 2.0.
20
  """
21
- ... # TODO
22
  def get_ngrams(text, n):
23
  """Extract character n-grams from a string."""
24
  return Counter([text[i:i+n] for i in range(len(text) - n + 1)])
 
18
  :param beta:
19
  The weight of recall in the F-score. Default is 2.0.
20
  """
21
+
22
def get_ngrams(text, n):
    """Return a Counter mapping each character n-gram of *text* to its count.

    :param text: Input string.
    :param n: N-gram length in characters.
    :return: collections.Counter of n-gram -> frequency; empty Counter
             when n > len(text) (the range below is then empty).
    """
    # Feed Counter a generator instead of materialising an intermediate
    # list: identical counts, no throwaway allocation.
    return Counter(text[i:i + n] for i in range(len(text) - n + 1))
evaluator/comet_hf.py CHANGED
@@ -1,35 +1,41 @@
1
  import os
2
- import requests
3
-
4
- # Set the Hugging Face Inference API URL and token
5
- HF_API_URL = "https://huggingface.co/Unbabel/wmt22-comet-da"
6
-
7
- HF_API_TOKEN = os.getenv("HF_API_TOKEN") # Ensure this is set in your environment
8
 
9
  def calculate_comet(source_sentences, translations, references):
10
  """
11
- Calculate COMET scores using the Hugging Face Inference API.
12
- :param source_sentences: List of source sentences.
13
- :param translations: List of translated sentences (hypotheses).
14
- :param references: List of reference translations.
15
- :return: List of COMET scores (one score per sentence pair).
16
  """
17
- headers = {
18
- "Authorization": f"Bearer {HF_API_TOKEN}",
19
- "Content-Type": "application/json"
20
- }
 
 
 
 
 
 
 
21
 
22
- # Prepare data for the API
23
- data = [
24
- {"source": src, "translation": mt, "reference": ref}
25
- for src, mt, ref in zip(source_sentences, translations, references)
26
- ]
 
 
 
 
27
 
28
- # Make the API call
29
- response = requests.post(HF_API_URL, headers=headers, json={"inputs": data})
30
- response.raise_for_status() # Raise an error for bad responses
31
 
32
- # Parse the response
33
- results = response.json()
34
- scores = [item["score"] for item in results] # Extract scores from the response
35
- return scores
 
1
  import os
2
+ import torch
3
+ from comet import download_model, load_from_checkpoint
 
 
 
 
4
 
5
# Cache the loaded COMET model across calls: downloading and loading the
# checkpoint is by far the dominant cost, and the original re-did it on
# every invocation.
_COMET_MODEL = None


def calculate_comet(source_sentences, translations, references):
    """
    Calculate COMET scores using the local COMET installation.

    :param source_sentences: List of source sentences.
    :param translations: List of translated sentences (hypotheses).
    :param references: List of reference translations.
    :return: List of COMET scores, one per (src, mt, ref) triple.  On any
             failure a list of 0.0 of the same length is returned so the
             caller's metrics pipeline keeps working (best effort).
    """
    global _COMET_MODEL

    # Nothing to score: avoid touching the (possibly unloaded) model at all.
    if not source_sentences:
        return []

    try:
        if _COMET_MODEL is None:
            # Set the cache directory explicitly so containerised runs
            # (read-only home) have a writable location.
            os.environ["COMET_CACHE"] = "/tmp"
            model_path = download_model("Unbabel/wmt22-comet-da")
            _COMET_MODEL = load_from_checkpoint(model_path)

        # Prefer GPU when one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _COMET_MODEL.to(device)

        # COMET expects a list of {"src", "mt", "ref"} dicts.
        data = [
            {"src": src, "mt": mt, "ref": ref}
            for src, mt, ref in zip(source_sentences, translations, references)
        ]

        # predict() takes a GPU count, not a device string.
        results = _COMET_MODEL.predict(
            data, batch_size=8, gpus=1 if device == "cuda" else 0
        )
        return results["scores"]

    except Exception as e:
        # Deliberate best-effort: a metric failure must not crash the app,
        # so degrade to neutral scores and surface the problem on stdout.
        print(f"COMET Error: {str(e)}")
        return [0.0] * len(source_sentences)
 
interface.py CHANGED
@@ -3,7 +3,7 @@ import requests
3
  import json
4
  import os
5
  from evaluator.chrf import calculate_chrf
6
- from evaluator.comet_hf import calculate_comet # Import the COMET function
7
  from pathlib import Path
8
 
9
  # OpenAI API URL and key
@@ -15,88 +15,95 @@ CHATGPT_MODELS = {
15
  }
16
 
17
  def improve_translations(system_prompt, temperature, top_p):
18
- # Load data
19
- data_dir = Path(__file__).parent / "evaluator" / "mt_data"
20
- source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
21
- beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
22
- reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()
23
-
24
- improved_translations = []
25
- sentence_pairs = [] # To store source, draft 1, draft 2, and reference
26
-
27
- for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
28
- # Construct the prompt
29
- user_prompt = f"""
30
- As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
31
- Source: {source}
32
- Target: {target}
33
- Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
34
- """
35
-
36
- # Prepare API payload
37
- payload = {
38
- "model": CHATGPT_MODELS["GPT-4"],
39
- "messages": [
40
- {"role": "system", "content": system_prompt},
41
- {"role": "user", "content": user_prompt}
42
- ],
43
- "temperature": temperature,
44
- "top_p": top_p,
45
- "max_tokens": 512
46
- }
47
-
48
- headers = {
49
- "Authorization": f"Bearer {OPENAI_API_KEY}",
50
- "Content-Type": "application/json"
51
- }
52
-
53
- # Call OpenAI API
54
- response = requests.post(OPENAI_API_URL, headers=headers, json=payload)
55
- response.raise_for_status()
56
- data = response.json()
57
-
58
- # Extract improved translation
59
- output = data["choices"][0]["message"]["content"]
60
- improved_translation = output.split("Improved Translation:")[-1].strip()
61
- improved_translations.append(improved_translation)
62
-
63
- # Add sentence pair to the list
64
- sentence_pairs.append([source, target, improved_translation, reference])
65
-
66
- # Calculate ChrF scores
67
- beam_chrf_scores = [
68
- calculate_chrf(beam_translation, reference)
69
- for beam_translation, reference in zip(beam_search_translations, reference_translations)
70
- ]
71
- improved_chrf_scores = [
72
- calculate_chrf(improved_translation, reference)
73
- for improved_translation, reference in zip(improved_translations, reference_translations)
74
- ]
75
-
76
- # Calculate COMET scores
77
- beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
78
- improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)
79
-
80
- # Calculate average scores
81
- average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
82
- average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
83
- average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
84
- average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)
85
-
86
- # Calculate score changes
87
- chrf_change = average_improved_chrf - average_beam_chrf
88
- comet_change = average_improved_comet - average_beam_comet
89
-
90
- # Prepare dataframes
91
- sentence_pairs_df = sentence_pairs # Dataframe for sentence pairs
92
- scores_df = [
93
- ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
94
- ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
95
- ]
96
-
97
- # Return dataframes and evaluation message
98
- evaluation_message = f"ChrF Change: {(average_improved_chrf/chrf_change):.2f}%, COMET Change: {(average_improved_comet/comet_change):.2f}%"
99
- return sentence_pairs_df, scores_df, evaluation_message
 
 
 
 
 
 
 
100
 
101
  # Gradio interface
102
  app = gr.Interface(
@@ -115,6 +122,5 @@ app = gr.Interface(
115
  description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET."
116
  )
117
 
118
-
119
  if __name__ == "__main__":
120
- app.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import json
4
  import os
5
  from evaluator.chrf import calculate_chrf
6
+ from evaluator.comet_hf import calculate_comet
7
  from pathlib import Path
8
 
9
  # OpenAI API URL and key
 
15
  }
16
 
17
def improve_translations(system_prompt, temperature, top_p):
    """
    Post-edit beam-search translations with GPT-4 and score the results.

    :param system_prompt: System message sent to the chat model.
    :param temperature: Sampling temperature forwarded to the API.
    :param top_p: Nucleus-sampling parameter forwarded to the API.
    :return: (sentence_pairs, scores_table, message) where sentence_pairs is
             a list of [source, draft, improved, reference] rows, scores_table
             holds averaged ChrF/COMET before/after/change rows, and message
             summarises the absolute score changes.  On any failure returns
             ([], [], "Error: ...") so the Gradio UI can display the problem.
    """
    if not OPENAI_API_KEY:
        return [], [], "Error: OpenAI API key not found"

    try:
        # Load the evaluation corpus shipped with the app.
        data_dir = Path(__file__).parent / "evaluator" / "mt_data"
        source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
        beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
        reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()

        # Request headers are loop-invariant: build them once, not per sentence.
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }

        improved_translations = []
        sentence_pairs = []  # [source, draft 1, draft 2, reference] rows

        for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
            # Construct the post-editing prompt for this sentence pair.
            user_prompt = f"""
            As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
            Source: {source}
            Target: {target}
            Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
            """

            payload = {
                "model": CHATGPT_MODELS["GPT-4"],
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                "temperature": temperature,
                "top_p": top_p,
                "max_tokens": 512
            }

            # Call OpenAI; the timeout stops the UI hanging forever on a
            # stalled connection (a timeout raises and is reported below).
            response = requests.post(OPENAI_API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()

            # The model is instructed to return only the improved target text.
            improved_translation = data["choices"][0]["message"]["content"].strip()
            improved_translations.append(improved_translation)
            sentence_pairs.append([source, target, improved_translation, reference])

        # Sentence-level ChrF for both drafts.
        beam_chrf_scores = [
            calculate_chrf(mt, ref)
            for mt, ref in zip(beam_search_translations, reference_translations)
        ]
        improved_chrf_scores = [
            calculate_chrf(mt, ref)
            for mt, ref in zip(improved_translations, reference_translations)
        ]

        # COMET for both drafts (one call per draft, scored per sentence).
        beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
        improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)

        # Corpus averages and absolute before/after changes.
        average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
        average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
        average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
        average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)

        chrf_change = average_improved_chrf - average_beam_chrf
        comet_change = average_improved_comet - average_beam_comet

        scores_df = [
            ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
            ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
        ]

        # Report absolute changes (no division).
        evaluation_message = f"ChrF Change: {chrf_change:.2f}, COMET Change: {comet_change:.2f}"
        return sentence_pairs, scores_df, evaluation_message

    except Exception as e:
        # Boundary handler: surface any failure (missing files, HTTP error,
        # empty corpus division) to the UI instead of crashing Gradio.
        return [], [], f"Error: {str(e)}"
107
 
108
  # Gradio interface
109
  app = gr.Interface(
 
122
  description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET."
123
  )
124
 
 
125
  if __name__ == "__main__":
126
+ app.launch(server_name="0.0.0.0", server_port=7860)