| import json | |
| import random | |
| import gradio as gr | |
| from difflib import SequenceMatcher | |
# Path of the JSON Lines file the examples are read from (one JSON object per line).
file_path = "dataset.jsonl"
# A record is kept only when its "similarity_ratio" is strictly greater than this value.
similarity_threshold = 0.85
# Index into `examples` used to fill the widgets' initial values.
current_index = 0
# Markdown rendered at the top of the page.
# NOTE(review): the text quotes a 0.9 similarity ratio and a count of 729, while
# similarity_threshold above is 0.85 -- confirm the figures still match the filter.
description_text = """
This Space is inspired by [Luis Hunt's](https://www.linkedin.com/posts/louiswhunt_see-below-for-6882-pages-of-mmlu-and-gsm8k-activity-7281011488692047872-fWCE?utm_source=share&utm_medium=member_desktop) post.
He highlights how current top performing models from major vendors are contaminated with benchmark data that is supposed to be used to assess their performance.
This space aims to partially reproduce this work. I chose to look at the contamination of **Qwen/Qwen2.5-14B** by **GSM8K** dataset.
I found **729** GSM8K Example that had a least a 0.9 text similarity ratio between generated an original.
"""
def find_similar_chunks(original, output):
    """Split ``output`` into consecutive chunks flagged as matching ``original`` or not.

    The matching spans come from ``difflib.SequenceMatcher.get_matching_blocks``
    computed between ``original`` and ``output``.

    Args:
        original: Reference text to compare against.
        output: Text to annotate; every character of it ends up in exactly
            one returned chunk.

    Returns:
        A list of ``(text, label)`` tuples in the order they appear in
        ``output``. ``label`` is ``1`` for a span that matches ``original``
        and ``None`` for a span that does not. Empty chunks are never
        emitted, so an empty ``output`` yields an empty list.
    """
    matcher = SequenceMatcher(None, original, output)
    left = 0  # position in `output` up to which chunks were already emitted
    highlighted_sequence = []
    for _, j, n in matcher.get_matching_blocks():
        if n == 0:
            # get_matching_blocks() always ends with a zero-length sentinel
            # block; emitting it would add a spurious empty highlighted chunk.
            continue
        if left < j:
            # Unmatched text sitting between the previous match and this one.
            highlighted_sequence.append((output[left:j], None))
        highlighted_sequence.append((output[j:j + n], 1))
        left = j + n
    if left < len(output):
        # Unmatched tail after the last real matching block.
        highlighted_sequence.append((output[left:], None))
    return highlighted_sequence
# Load the dataset once at import time, keeping only the records whose
# "similarity_ratio" is strictly above `similarity_threshold`.
# Each line is parsed a single time, and blank lines (e.g. a trailing newline
# at the end of the file) are skipped instead of raising a JSONDecodeError.
examples = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        if not line.strip():
            continue
        record = json.loads(line)
        if record["similarity_ratio"] > similarity_threshold:
            examples.append(record)
def next_example():
    """Pick a random record from ``examples`` and return its display values.

    Returns:
        A five-item list: the record's prompt, its original text, the
        chunked output produced by ``find_similar_chunks``, its similarity
        ratio and its seed -- in that order.
    """
    picked = random.choice(examples)
    reference_text = picked["original"]
    chunks = find_similar_chunks(reference_text, picked["output"])
    row = [picked["prompt"], reference_text, chunks]
    row.append(picked["similarity_ratio"])
    row.append(picked["seed"])
    return row
# Page layout. Every widget is initialised from examples[current_index].
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation -- confirm it matches the intended layout.
with gr.Blocks() as demo:
    # Top row: description in the first column, second column left empty.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(description_text)
        with gr.Column(scale=1):
            pass
    # Read-only prompt of the current example.
    prompt = gr.Textbox(
        label="Prompt",
        interactive=False,
        value=examples[current_index]["prompt"],
    )
    # Original text next to the output with its matching chunks marked.
    with gr.Row():
        with gr.Column(scale=4):
            original = gr.Textbox(
                label="Original",
                interactive=False,
                value=examples[current_index]["original"],
            )
        with gr.Column(scale=4):
            # NOTE(review): find_similar_chunks labels matches with the int 1,
            # while color_map is keyed by the string "1" -- confirm the
            # component applies the yellow colour to those chunks.
            output = gr.HighlightedText(
                label="Output",
                color_map={"1": "yellow"},
                value=find_similar_chunks(examples[current_index]["original"],
                                          examples[current_index]["output"]),
            )
    # Metadata of the current example.
    with gr.Row():
        with gr.Column(scale=1):
            similarity = gr.Textbox(
                label="Similarity ratio",
                interactive=False,
                value=examples[current_index]["similarity_ratio"],
            )
        with gr.Column(scale=1):
            seed = gr.Textbox(
                label="Seed",
                interactive=False,
                value=examples[current_index]["seed"],
            )
    # NOTE(review): the label reads "Anoter" -- presumably "Another" was meant.
    next_btn = gr.Button("Anoter example")
    # The order of `outputs` must match the order of the list returned by next_example.
    next_btn.click(fn=next_example,
                   outputs=[prompt, original, output, similarity, seed])
# Start the app.
demo.launch()