mdok detector of machine-generated texts for PAN 2025 (available for non-commercial research purposes only; by requesting access to this model you agree to this condition).
More information, as well as the training code, is available in the repo.
## Usage
The model is a fine-tuned Qwen3-14B-Base; therefore, use a recent version of the transformers library that supports this architecture.
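If you are unsure whether your installed version already supports Qwen3, a quick check (the 4.51 figure is an assumption of when Qwen3 support landed upstream; verify against the transformers release notes):

```python
import transformers

# Qwen3 models require a sufficiently recent transformers release (assumed ~4.51+)
print(transformers.__version__)
```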
Assume the texts for evaluation are loaded in a pandas DataFrame `test_df` in the column `text`. First, anonymize the texts (not strictly necessary, but the model was trained on texts anonymized this way). Then run the inference.
```python
import re

def preprocess(text):
    EMAIL_PATTERN = re.compile(r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b")  # e.g., user@example.com
    USER_MENTION_PATTERN = re.compile(r"@[A-Za-z0-9_-]+")  # e.g., @my_username
    PHONE_PATTERN = re.compile(r"(\+?\d{1,3})?[\s\*\.-]?\(?\d{1,4}\)?[\s\*\.-]?\d{2,4}[\s\*\.-]?\d{2,6}")  # modified from https://stackabuse.com/python-regular-expressions-validate-phone-numbers/
    text = re.sub(EMAIL_PATTERN, "[EMAIL]", text)
    text = re.sub(USER_MENTION_PATTERN, "[USER]", text)
    text = re.sub(PHONE_PATTERN, " [PHONE]", text).replace("  [PHONE]", " [PHONE]")  # collapse the double space left by the substitution
    return text.lower().strip()
```
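A quick sanity check of the anonymization (the input string below is purely illustrative):

```python
print(preprocess("Contact me at john.doe@example.com or @john_doe, phone +1 555 123 4567."))
# should print: contact me at [email] or [user], phone [phone].
```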
```python
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, roc_auc_score

def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True, max_length=512)

f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = predictions[:, 1]
    predictions = np.argmax(predictions, axis=1)
    results = {"AUC": roc_auc_score(labels, probs), "ACC": accuracy_score(labels, predictions), "MacroF1": f1_score(labels, predictions, average='macro'), "MAE": mean_absolute_error(labels, predictions)}
    return results
```
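To illustrate the output format of `compute_metrics`, a toy call on made-up logits (the values are illustrative only):

```python
dummy_logits = np.array([[2.0, -1.0], [0.3, 1.7], [-0.5, 2.5]])  # hypothetical model outputs
dummy_labels = np.array([0, 1, 1])
print(compute_metrics((dummy_logits, dummy_labels)))
# {'AUC': 1.0, 'ACC': 1.0, 'MacroF1': 1.0, 'MAE': 0.0}
```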
```python
import torch
from datasets import Dataset
from scipy.special import softmax
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
)

def test(test_df, model_path, id2label, label2id):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, trust_remote_code=True, num_labels=len(label2id), id2label=id2label, label2id=label2id, torch_dtype=torch.float16
    )
    # make sure the tokenizer has a pad token (decoder-only backbones often lack one)
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
    try:
        model.config.pad_token_id = tokenizer.get_vocab()[tokenizer.pad_token]
    except Exception:
        print("Warning: exception occurred while setting pad_token_id")
    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    predictions = trainer.predict(tokenized_test_dataset)
    prob_pred = softmax(predictions.predictions, axis=-1)
    return prob_pred
```
```python
test_df['text'] = [preprocess(x) for x in test_df['text']]
probs = test(test_df, "DominikMacko/mdok", {0: "human", 1: "machine"}, {"human": 0, "machine": 1})
```
Now `probs[:, 1]` contains the probability of each text being machine-generated. Either calibrate the classification threshold on your own data (a sketch follows below), or use the default of > 0.5 representing the "machine" label.
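As a minimal sketch of the calibration option, assuming you have a small labeled development set with machine-class probabilities in `dev_probs` and gold labels in `dev_labels` (1 = machine; both names are placeholders), you could pick the threshold that maximizes macro-F1:

```python
import numpy as np
from sklearn.metrics import f1_score

def calibrate_threshold(dev_probs, dev_labels):
    # sweep candidate thresholds and keep the one with the best macro-F1 on the dev set
    thresholds = np.linspace(0.05, 0.95, 181)
    scores = [f1_score(dev_labels, (dev_probs >= t).astype(int), average='macro') for t in thresholds]
    return thresholds[int(np.argmax(scores))]

# threshold = calibrate_threshold(dev_probs, dev_labels)
# pred_labels = (probs[:, 1] >= threshold).astype(int)  # 1 = "machine"
```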
If GPU memory is limited, load the model in 4-bit instead (replacing the `from_pretrained` call in `test` above):
```python
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, trust_remote_code=True, num_labels=len(label2id), id2label=id2label, label2id=label2id, torch_dtype=torch.float16, quantization_config=bnb_config
)
```
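Note that 4-bit loading requires the `bitsandbytes` package and a CUDA-capable GPU, and the resulting probabilities may differ slightly from those of the float16 model.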
## Cite
If you use the model, code, or any information from this repository, please cite the following papers:
```bibtex
@misc{macko2025mdokkinitrobustlyfinetuned,
  title={mdok of {KInIT}: Robustly Fine-tuned {LLM} for Binary and Multiclass {AI}-Generated Text Detection},
  author={Dominik Macko},
  year={2025},
  eprint={2506.01702},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2506.01702},
}
@misc{macko2025increasingrobustnessfinetunedmultilingual,
  title={Increasing the Robustness of the Fine-tuned Multilingual Machine-Generated Text Detectors},
  author={Dominik Macko and Robert Moro and Ivan Srba},
  year={2025},
  eprint={2503.15128},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2503.15128},
}
```