---
language: en
license: apache-2.0
tags:
- token-classification
- distilbert
- ner
- message-parsing
- natural-language-understanding
datasets:
- custom
metrics:
- accuracy
- f1
pipeline_tag: token-classification
---
|
|
|
|
|
# DistilBERT Message Parser 🤖💬 |
|
|
|
|
|
A fine-tuned DistilBERT model for parsing natural language queries to extract **receiver** (person) and **content** (message) information from user requests. |
|
|
|
|
|
## Model Description |
|
|
|
|
|
This model performs token-level classification to identify: |
|
|
- **`person`**: The recipient/receiver of the message |
|
|
- **`content`**: The message content to be sent |
|
|
- **`O`**: Other tokens (Outside) |
|
|
|
|
|
## Use Cases |
|
|
|
|
|
Perfect for virtual assistants, chatbots, and messaging applications that need to understand commands like: |
|
|
- "Send a message to Mom telling her I'll be home late" |
|
|
- "Ask the python teacher when is the next class" |
|
|
- "Text John about tomorrow's meeting" |
|
|
|
|
|
## Quick Start |
|
|
|
|
|
### Installation |
|
|
|
|
|
```bash |
|
|
pip install transformers torch |
|
|
``` |
|
|
|
|
|
### Basic Usage |
|
|
|
|
|
```python |
|
|
from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
|
import torch |
|
|
|
|
|
# Load model and tokenizer |
|
|
model_name = "AbdellatifZ/distilbert-message-parser" # Replace with your model name |
|
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
|
model = AutoModelForTokenClassification.from_pretrained(model_name) |
|
|
|
|
|
# Helper function for word-level predictions |
|
|
def predict_at_word_level(words, model, tokenizer):
    """Predict one label id per input word (not per subword token).

    Args:
        words (list[str]): Pre-split words of a single query.
        model: Token classification model. It is called as ``model(**inputs)``
            and must return an object with a ``logits`` attribute; its
            ``device`` attribute decides where the inputs are placed.
        tokenizer: Tokenizer whose encoding exposes ``word_ids()`` and
            ``to(device)``.

    Returns:
        list[int]: Predicted label id for each word, taken from the word's
        first subtoken. Empty when ``words`` is empty.
    """
    # Nothing to classify: skip tokenization and the forward pass entirely.
    if not words:
        return []

    inputs = tokenizer(words, return_tensors="pt", is_split_into_words=True)
    # Put the encoded tensors on the same device as the model so inference
    # also works after the model has been moved (e.g. ``model.to("cuda")``).
    inputs = inputs.to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits
    # A single query is a batch of size 1; convert to plain ints once instead
    # of calling ``.item()`` for every token.
    token_predictions = torch.argmax(logits, dim=2)[0].tolist()

    word_labels = []
    previous_word_idx = None
    for token_idx, word_idx in enumerate(inputs.word_ids()):
        if word_idx is None:  # Special tokens
            continue
        if word_idx != previous_word_idx:  # First subtoken of each word
            word_labels.append(token_predictions[token_idx])
        previous_word_idx = word_idx

    return word_labels
|
|
|
|
|
# Main parsing function |
|
|
def parse_message(query, model, tokenizer):
    """
    Extract the receiver and the message content from a query.

    The query is split on whitespace, every word gets a predicted label,
    and the words are then grouped by that label.

    Args:
        query (str): User query in natural language
        model: Token classification model
        tokenizer: Tokenizer

    Returns:
        dict: {"receiver": str or None, "content": str or None}; a value is
        None when no word received the corresponding label.
    """
    words = query.split()
    predicted_ids = predict_at_word_level(words, model, tokenizer)

    id_to_name = model.config.id2label
    label_names = [id_to_name[label_id] for label_id in predicted_ids]

    # Bucket the words by label in one pass; other labels (e.g. 'O') are dropped.
    grouped = {'person': [], 'content': []}
    for word, name in zip(words, label_names):
        if name in grouped:
            grouped[name].append(word)

    receiver = ' '.join(grouped['person']) if grouped['person'] else None
    content = ' '.join(grouped['content']) if grouped['content'] else None
    return {'receiver': receiver, 'content': content}
|
|
|
|
|
# Example usage |
|
|
query = "Ask the python teacher when is the next class" |
|
|
result = parse_message(query, model, tokenizer) |
|
|
print(result) |
|
|
# Output: {'receiver': 'the python teacher', 'content': 'when is the next class'} |
|
|
``` |
|
|
|
|
|
## More Examples |
|
|
|
|
|
```python |
|
|
# Example 1: Simple message |
|
|
query = "Send a message to Mom telling her I'll be home late" |
|
|
result = parse_message(query, model, tokenizer) |
|
|
print(result) |
|
|
# {'receiver': 'Mom', 'content': "telling her I'll be home late"} |
|
|
|
|
|
# Example 2: Professional context |
|
|
query = "Write to the professor asking about the exam format" |
|
|
result = parse_message(query, model, tokenizer) |
|
|
print(result) |
|
|
# {'receiver': 'the professor', 'content': 'asking about the exam format'} |
|
|
|
|
|
# Example 3: Casual context |
|
|
query = "Text John asking if he's available for a meeting tomorrow" |
|
|
result = parse_message(query, model, tokenizer) |
|
|
print(result) |
|
|
# {'receiver': 'John', 'content': "asking if he's available for a meeting tomorrow"} |
|
|
``` |
|
|
|
|
|
## Advanced Usage: Batch Processing |
|
|
|
|
|
```python |
|
|
def parse_messages_batch(queries, model, tokenizer):
    """Parse several queries and return their results in input order.

    Each query is handed to ``parse_message`` individually, one after
    another; this is a convenience wrapper, not batched inference.

    Args:
        queries: Iterable of query strings.
        model: Token classification model
        tokenizer: Tokenizer

    Returns:
        list[dict]: One ``parse_message`` result per query.
    """
    return [parse_message(query, model, tokenizer) for query in queries]
|
|
|
|
|
# Batch example |
|
|
queries = [ |
|
|
"Ask the python teacher when is the next class", |
|
|
"Message the customer support about my order status", |
|
|
"Text my friend to see if they're coming tonight" |
|
|
] |
|
|
|
|
|
results = parse_messages_batch(queries, model, tokenizer) |
|
|
for query, result in zip(queries, results): |
|
|
print(f"Query: {query}") |
|
|
print(f"Result: {result}\n") |
|
|
``` |
|
|
|
|
|
## Detailed Token-Level Analysis |
|
|
|
|
|
```python |
|
|
def visualize_parsing(query, model, tokenizer):
    """Print the predicted label of every word, then the parsed summary.

    Args:
        query (str): User query in natural language
        model: Token classification model
        tokenizer: Tokenizer
    """
    tokens = query.split()
    predicted_ids = predict_at_word_level(tokens, model, tokenizer)

    label_lookup = model.config.id2label
    tags = [label_lookup[predicted_id] for predicted_id in predicted_ids]

    # Per-word table
    print(f"\nQuery: {query}\n")
    print(f"{'Word':<25} {'Label':<10}")
    print("-" * 35)
    for token, tag in zip(tokens, tags):
        print(f"{token:<25} {tag:<10}")

    # Summary of the extracted fields
    summary = parse_message(query, model, tokenizer)
    print(f"\n{'='*35}")
    print(f"Receiver: {summary['receiver']}")
    print(f"Content: {summary['content']}")
    print(f"{'='*35}")
|
|
|
|
|
# Example |
|
|
visualize_parsing("Ask the python teacher when is the next class", model, tokenizer) |
|
|
``` |
|
|
|
|
|
**Output:** |
|
|
```

Query: Ask the python teacher when is the next class

Word                      Label
-----------------------------------
Ask                       O
the                       person
python                    person
teacher                   person
when                      content
is                        content
the                       content
next                      content
class                     content

===================================
Receiver: the python teacher
Content: when is the next class
===================================
```
|
|
|
|
|
## API Integration Example |
|
|
|
|
|
```python |
|
|
from flask import Flask, request, jsonify |
|
|
|
|
|
app = Flask(__name__) |
|
|
|
|
|
# Load model once at startup |
|
|
model = AutoModelForTokenClassification.from_pretrained("AbdellatifZ/distilbert-message-parser") |
|
|
tokenizer = AutoTokenizer.from_pretrained("AbdellatifZ/distilbert-message-parser") |
|
|
|
|
|
@app.route('/parse', methods=['POST'])
def parse():
    """Parse the ``query`` field of a JSON POST body.

    Returns:
        200 with ``{'success': True, 'query': ..., 'parsed': ...}`` on success.
        400 with ``{'error': ...}`` when the body is not a JSON object or
        ``query`` is missing, blank, or not a string.
        500 with ``{'error': ...}`` when parsing itself raises.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so bad requests get the JSON 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'No query provided'}), 400

    query = data.get('query', '')
    # parse_message calls query.split(), so only non-blank strings are valid
    # input; anything else is a client error, not a server error.
    if not isinstance(query, str) or not query.strip():
        return jsonify({'error': 'No query provided'}), 400

    # Keep the try body to the one call that can fail at inference time.
    try:
        result = parse_message(query, model, tokenizer)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

    return jsonify({
        'success': True,
        'query': query,
        'parsed': result,
    })
|
|
|
|
|
if __name__ == '__main__':
    # Do not hard-code debug=True: Flask's debug mode enables an interactive
    # debugger that can execute arbitrary code and must never be exposed.
    # For local development, enable it explicitly (e.g. FLASK_DEBUG=1).
    app.run()
|
|
``` |
|
|
|
|
|
## Model Details |
|
|
|
|
|
| Property | Value |
|----------|-------|
| Base Model | `distilbert-base-uncased` |
| Task | Token Classification (NER-style) |
| Number of Labels | 3 (O, content, person) |
| Training Framework | Transformers (Hugging Face) |
| Parameters | ~67M (DistilBERT) |
| Max Sequence Length | 128 tokens |
|
|
|
|
|
## Training Details |
|
|
|
|
|
### Dataset |
|
|
- Source: Custom Presto-based dataset |
|
|
- Task: Send_message queries |
|
|
- Labels: `person`, `content`, `O` |
|
|
- Split: 70% train, 15% validation, 15% test |
|
|
|
|
|
### Training Configuration |
|
|
- **Epochs**: 15 |
|
|
- **Batch Size**: 16 |
|
|
- **Learning Rate**: 2e-5 |
|
|
- **Optimizer**: AdamW |
|
|
- **Weight Decay**: 0.01 |
|
|
- **Warmup Steps**: 100 |
|
|
|
|
|
### Label Alignment |
|
|
During training, labels are aligned with the subword tokens produced by the tokenizer:
|
|
- Only the first subtoken of each word receives a label |
|
|
- Subsequent subtokens are marked with `-100` (ignored in loss computation) |
|
|
- Special tokens ([CLS], [SEP], [PAD]) are also ignored |
|
|
|
|
|
## Performance |
|
|
|
|
|
| Metric | Value |
|--------|-------|
| Accuracy | >0.90 |
| Precision | >0.88 |
| Recall | >0.88 |
| F1-Score | >0.88 |
|
|
|
|
|
*Note: Actual metrics may vary depending on your specific use case and dataset.* |
|
|
|
|
|
## Limitations |
|
|
|
|
|
- **Language**: Optimized for English queries only |
|
|
- **Domain**: Best performance on message-sending commands |
|
|
- **Structure**: May struggle with highly unusual or complex sentence structures |
|
|
- **Context**: Limited to single-turn queries (no conversation context) |
|
|
|
|
|
## Error Handling |
|
|
|
|
|
```python |
|
|
def safe_parse_message(query, model, tokenizer):
    """Parse a query, reporting problems in the result instead of raising.

    Returns:
        dict: The ``parse_message`` result when at least one entity was found.
        The same result plus a ``'warning'`` key when neither receiver nor
        content was found. A dict with an ``'error'`` key and ``None``
        receiver/content for an empty or blank query, or when any exception
        is raised.
    """
    try:
        # Falsy values and whitespace-only strings count as "no query".
        has_text = query and query.strip()
        if not has_text:
            return {'error': 'Empty query', 'receiver': None, 'content': None}

        parsed = parse_message(query, model, tokenizer)

        # Flag results in which the model labelled nothing useful.
        found_entity = parsed['receiver'] or parsed['content']
        if found_entity:
            return parsed
        return {'warning': 'No entities found', **parsed}

    except Exception as exc:
        return {'error': str(exc), 'receiver': None, 'content': None}
|
|
|
|
|
# Example |
|
|
result = safe_parse_message("", model, tokenizer) |
|
|
print(result) # {'error': 'Empty query', 'receiver': None, 'content': None} |
|
|
``` |
|
|
|
|
|
## Citation |
|
|
|
|
|
If you use this model in your research, please cite: |
|
|
|
|
|
```bibtex |
|
|
@misc{distilbert-message-parser, |
|
|
author = {Your Name}, |
|
|
title = {DistilBERT Message Parser: Token Classification for Message Intent Extraction}, |
|
|
year = {2025}, |
|
|
publisher = {Hugging Face}, |
|
|
howpublished = {\url{https://huggingface.co/AbdellatifZ/distilbert-message-parser}} |
|
|
} |
|
|
``` |
|
|
|
|
|
## License |
|
|
|
|
|
This model is released under the Apache 2.0 License. |
|
|
|
|
|
## Contact & Feedback |
|
|
|
|
|
For questions, issues, or feedback: |
|
|
- Open an issue on the model repository |
|
|
- Contact: [Your contact information] |
|
|
|
|
|
## Acknowledgments |
|
|
|
|
|
- Base model: [DistilBERT](https://huggingface.co/distilbert-base-uncased) by Hugging Face |
|
|
- Framework: [Transformers](https://github.com/huggingface/transformers) by Hugging Face |
|
|
- Dataset inspiration: Presto benchmark |
|
|
|
|
|
--- |
|
|
|
|
|
**Built with Transformers 🤗** |
|
|
|