morgankavanagh commited on
Commit
d130b8e
·
1 Parent(s): 1fb8162

Fixes for Docker, chrf, comet_hf, and interface

Browse files
Files changed (4) hide show
  1. Dockerfile +6 -2
  2. evaluator/chrf.py +1 -1
  3. evaluator/comet_hf.py +33 -27
  4. interface.py +91 -85
Dockerfile CHANGED
@@ -14,11 +14,15 @@ COPY . .
14
  RUN apt-get update && apt-get install -y \
15
  git \
16
  build-essential \
17
- && rm -rf /var/lib/apt/lists/*
18
 
19
  # Install Python dependencies
20
  RUN pip install --upgrade pip
21
- RUN pip install -r requirements.txt
 
 
 
 
22
 
23
  # Expose port for Gradio
24
  EXPOSE 7860
 
14
  RUN apt-get update && apt-get install -y \
15
  git \
16
  build-essential \
17
+ && rm -rf /var/lib/apt/lists/*
18
 
19
  # Install Python dependencies
20
  RUN pip install --upgrade pip
21
+ RUN pip install --no-cache-dir -r requirements.txt
22
+ RUN pip install --no-cache-dir unbabel-comet
23
+
24
+ # Environment variables
25
+ ENV COMET_CACHE="/tmp"
26
 
27
  # Expose port for Gradio
28
  EXPOSE 7860
evaluator/chrf.py CHANGED
@@ -18,7 +18,7 @@ def calculate_chrf(
18
  :param beta:
19
  The weight of recall in the F-score. Default is 2.0.
20
  """
21
- ... # TODO
22
  def get_ngrams(text, n):
23
  """Extract character n-grams from a string."""
24
  return Counter([text[i:i+n] for i in range(len(text) - n + 1)])
 
18
  :param beta:
19
  The weight of recall in the F-score. Default is 2.0.
20
  """
21
+
22
def get_ngrams(text, n):
    """Return a Counter mapping each character n-gram of *text* to its count.

    :param text: Input string.
    :param n: N-gram length in characters.
    :return: collections.Counter of n-gram -> frequency; empty Counter
             when n > len(text) (the range below is then empty).
    """
    # Feed Counter a generator instead of materialising an intermediate
    # list: identical counts, no throwaway allocation.
    return Counter(text[i:i + n] for i in range(len(text) - n + 1))
evaluator/comet_hf.py CHANGED
@@ -1,35 +1,41 @@
1
  import os
2
- import requests
3
-
4
- # Set the Hugging Face Inference API URL and token
5
- HF_API_URL = "https://huggingface.co/Unbabel/wmt22-comet-da"
6
-
7
- HF_API_TOKEN = os.getenv("HF_API_TOKEN") # Ensure this is set in your environment
8
 
9
  def calculate_comet(source_sentences, translations, references):
10
  """
11
- Calculate COMET scores using the Hugging Face Inference API.
12
- :param source_sentences: List of source sentences.
13
- :param translations: List of translated sentences (hypotheses).
14
- :param references: List of reference translations.
15
- :return: List of COMET scores (one score per sentence pair).
16
  """
17
- headers = {
18
- "Authorization": f"Bearer {HF_API_TOKEN}",
19
- "Content-Type": "application/json"
20
- }
 
 
 
 
 
 
 
21
 
22
- # Prepare data for the API
23
- data = [
24
- {"source": src, "translation": mt, "reference": ref}
25
- for src, mt, ref in zip(source_sentences, translations, references)
26
- ]
 
 
 
 
27
 
28
- # Make the API call
29
- response = requests.post(HF_API_URL, headers=headers, json={"inputs": data})
30
- response.raise_for_status() # Raise an error for bad responses
31
 
32
- # Parse the response
33
- results = response.json()
34
- scores = [item["score"] for item in results] # Extract scores from the response
35
- return scores
 
1
  import os
2
+ import torch
3
+ from comet import download_model, load_from_checkpoint
 
 
 
 
4
 
5
# Cache the loaded COMET model across calls: downloading and loading the
# checkpoint is by far the dominant cost, and the original re-did it on
# every invocation.
_COMET_MODEL = None


def calculate_comet(source_sentences, translations, references):
    """
    Calculate COMET scores using the local COMET installation.

    :param source_sentences: List of source sentences.
    :param translations: List of translated sentences (hypotheses).
    :param references: List of reference translations.
    :return: List of COMET scores, one per (src, mt, ref) triple.  On any
             failure a list of 0.0 of the same length is returned so the
             caller's metrics pipeline keeps working (best effort).
    """
    global _COMET_MODEL

    # Nothing to score: avoid touching the (possibly unloaded) model at all.
    if not source_sentences:
        return []

    try:
        if _COMET_MODEL is None:
            # Set the cache directory explicitly so containerised runs
            # (read-only home) have a writable location.
            os.environ["COMET_CACHE"] = "/tmp"
            model_path = download_model("Unbabel/wmt22-comet-da")
            _COMET_MODEL = load_from_checkpoint(model_path)

        # Prefer GPU when one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _COMET_MODEL.to(device)

        # COMET expects a list of {"src", "mt", "ref"} dicts.
        data = [
            {"src": src, "mt": mt, "ref": ref}
            for src, mt, ref in zip(source_sentences, translations, references)
        ]

        # predict() takes a GPU count, not a device string.
        results = _COMET_MODEL.predict(
            data, batch_size=8, gpus=1 if device == "cuda" else 0
        )
        return results["scores"]

    except Exception as e:
        # Deliberate best-effort: a metric failure must not crash the app,
        # so degrade to neutral scores and surface the problem on stdout.
        print(f"COMET Error: {str(e)}")
        return [0.0] * len(source_sentences)
 
interface.py CHANGED
@@ -3,7 +3,7 @@ import requests
3
  import json
4
  import os
5
  from evaluator.chrf import calculate_chrf
6
- from evaluator.comet_hf import calculate_comet # Import the COMET function
7
  from pathlib import Path
8
 
9
  # OpenAI API URL and key
@@ -15,88 +15,95 @@ CHATGPT_MODELS = {
15
  }
16
 
17
  def improve_translations(system_prompt, temperature, top_p):
18
- # Load data
19
- data_dir = Path(__file__).parent / "evaluator" / "mt_data"
20
- source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
21
- beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
22
- reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()
23
-
24
- improved_translations = []
25
- sentence_pairs = [] # To store source, draft 1, draft 2, and reference
26
-
27
- for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
28
- # Construct the prompt
29
- user_prompt = f"""
30
- As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
31
- Source: {source}
32
- Target: {target}
33
- Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
34
- """
35
-
36
- # Prepare API payload
37
- payload = {
38
- "model": CHATGPT_MODELS["GPT-4"],
39
- "messages": [
40
- {"role": "system", "content": system_prompt},
41
- {"role": "user", "content": user_prompt}
42
- ],
43
- "temperature": temperature,
44
- "top_p": top_p,
45
- "max_tokens": 512
46
- }
47
-
48
- headers = {
49
- "Authorization": f"Bearer {OPENAI_API_KEY}",
50
- "Content-Type": "application/json"
51
- }
52
-
53
- # Call OpenAI API
54
- response = requests.post(OPENAI_API_URL, headers=headers, json=payload)
55
- response.raise_for_status()
56
- data = response.json()
57
-
58
- # Extract improved translation
59
- output = data["choices"][0]["message"]["content"]
60
- improved_translation = output.split("Improved Translation:")[-1].strip()
61
- improved_translations.append(improved_translation)
62
-
63
- # Add sentence pair to the list
64
- sentence_pairs.append([source, target, improved_translation, reference])
65
-
66
- # Calculate ChrF scores
67
- beam_chrf_scores = [
68
- calculate_chrf(beam_translation, reference)
69
- for beam_translation, reference in zip(beam_search_translations, reference_translations)
70
- ]
71
- improved_chrf_scores = [
72
- calculate_chrf(improved_translation, reference)
73
- for improved_translation, reference in zip(improved_translations, reference_translations)
74
- ]
75
-
76
- # Calculate COMET scores
77
- beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
78
- improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)
79
-
80
- # Calculate average scores
81
- average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
82
- average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
83
- average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
84
- average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)
85
-
86
- # Calculate score changes
87
- chrf_change = average_improved_chrf - average_beam_chrf
88
- comet_change = average_improved_comet - average_beam_comet
89
-
90
- # Prepare dataframes
91
- sentence_pairs_df = sentence_pairs # Dataframe for sentence pairs
92
- scores_df = [
93
- ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
94
- ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
95
- ]
96
-
97
- # Return dataframes and evaluation message
98
- evaluation_message = f"ChrF Change: {(average_improved_chrf/chrf_change):.2f}%, COMET Change: {(average_improved_comet/comet_change):.2f}%"
99
- return sentence_pairs_df, scores_df, evaluation_message
 
 
 
 
 
 
 
100
 
101
  # Gradio interface
102
  app = gr.Interface(
@@ -115,6 +122,5 @@ app = gr.Interface(
115
  description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET."
116
  )
117
 
118
-
119
  if __name__ == "__main__":
120
- app.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import json
4
  import os
5
  from evaluator.chrf import calculate_chrf
6
+ from evaluator.comet_hf import calculate_comet
7
  from pathlib import Path
8
 
9
  # OpenAI API URL and key
 
15
  }
16
 
17
def improve_translations(system_prompt, temperature, top_p):
    """
    Post-edit beam-search translations with GPT-4 and score the results.

    :param system_prompt: System message sent to the chat model.
    :param temperature: Sampling temperature forwarded to the API.
    :param top_p: Nucleus-sampling parameter forwarded to the API.
    :return: (sentence_pairs, scores_table, message) where sentence_pairs is
             a list of [source, draft, improved, reference] rows, scores_table
             holds averaged ChrF/COMET before/after/change rows, and message
             summarises the absolute score changes.  On any failure returns
             ([], [], "Error: ...") so the Gradio UI can display the problem.
    """
    if not OPENAI_API_KEY:
        return [], [], "Error: OpenAI API key not found"

    try:
        # Load the evaluation corpus shipped with the app.
        data_dir = Path(__file__).parent / "evaluator" / "mt_data"
        source_sentences = (data_dir / "source_sentences.txt").read_text(encoding="utf-8").splitlines()
        beam_search_translations = (data_dir / "beam_search_translations.txt").read_text(encoding="utf-8").splitlines()
        reference_translations = (data_dir / "reference_translations.txt").read_text(encoding="utf-8").splitlines()

        # Request headers are loop-invariant: build them once, not per sentence.
        headers = {
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }

        improved_translations = []
        sentence_pairs = []  # [source, draft 1, draft 2, reference] rows

        for source, target, reference in zip(source_sentences, beam_search_translations, reference_translations):
            # Construct the post-editing prompt for this sentence pair.
            user_prompt = f"""
            As an expert translation post editor, your task is to improve the English translation (Target) for the below German text (Source)
            Source: {source}
            Target: {target}
            Your output should be your improved version of the target text only. Do not add any comments or explanations before or after the improved version of the target text.
            """

            payload = {
                "model": CHATGPT_MODELS["GPT-4"],
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                "temperature": temperature,
                "top_p": top_p,
                "max_tokens": 512
            }

            # Call OpenAI; the timeout stops the UI hanging forever on a
            # stalled connection (a timeout raises and is reported below).
            response = requests.post(OPENAI_API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()

            # The model is instructed to return only the improved target text.
            improved_translation = data["choices"][0]["message"]["content"].strip()
            improved_translations.append(improved_translation)
            sentence_pairs.append([source, target, improved_translation, reference])

        # Sentence-level ChrF for both drafts.
        beam_chrf_scores = [
            calculate_chrf(mt, ref)
            for mt, ref in zip(beam_search_translations, reference_translations)
        ]
        improved_chrf_scores = [
            calculate_chrf(mt, ref)
            for mt, ref in zip(improved_translations, reference_translations)
        ]

        # COMET for both drafts (one call per draft, scored per sentence).
        beam_comet_scores = calculate_comet(source_sentences, beam_search_translations, reference_translations)
        improved_comet_scores = calculate_comet(source_sentences, improved_translations, reference_translations)

        # Corpus averages and absolute before/after changes.
        average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
        average_improved_chrf = sum(improved_chrf_scores) / len(improved_chrf_scores)
        average_beam_comet = sum(beam_comet_scores) / len(beam_comet_scores)
        average_improved_comet = sum(improved_comet_scores) / len(improved_comet_scores)

        chrf_change = average_improved_chrf - average_beam_chrf
        comet_change = average_improved_comet - average_beam_comet

        scores_df = [
            ["ChrF", round(average_beam_chrf, 2), round(average_improved_chrf, 2), round(chrf_change, 2)],
            ["COMET", round(average_beam_comet, 2), round(average_improved_comet, 2), round(comet_change, 2)]
        ]

        # Report absolute changes (no division).
        evaluation_message = f"ChrF Change: {chrf_change:.2f}, COMET Change: {comet_change:.2f}"
        return sentence_pairs, scores_df, evaluation_message

    except Exception as e:
        # Boundary handler: surface any failure (missing files, HTTP error,
        # empty corpus division) to the UI instead of crashing Gradio.
        return [], [], f"Error: {str(e)}"
107
 
108
  # Gradio interface
109
  app = gr.Interface(
 
122
  description="Improve translations using GPT-4 and evaluate the results with ChrF and COMET."
123
  )
124
 
 
125
  if __name__ == "__main__":
126
+ app.launch(server_name="0.0.0.0", server_port=7860)