Update README.md
Browse files
README.md
CHANGED
|
@@ -22,12 +22,13 @@ Replace usernames and links for placeholders: "@user" and "http".
|
|
| 22 |
If you're interested in retaining verified users who were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
|
| 23 |
```python
|
| 24 |
def preprocess(text):
|
| 25 |
-
|
| 26 |
-
for t in text.split(
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
| 31 |
```
|
| 32 |
|
| 33 |
## Example Masked Language Model
|
|
@@ -39,8 +40,8 @@ MODEL = "cardiffnlp/twitter-roberta-base-sep2021"
|
|
| 39 |
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
|
| 40 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 41 |
|
| 42 |
-
def
|
| 43 |
-
for i in range(
|
| 44 |
token = tokenizer.decode(candidates[i]['token'])
|
| 45 |
score = candidates[i]['score']
|
| 46 |
print("%d) %.5f %s" % (i+1, score, token))
|
|
@@ -50,11 +51,12 @@ texts = [
|
|
| 50 |
"I keep forgetting to bring a <mask>.",
|
| 51 |
"Looking forward to watching <mask> Game tonight!",
|
| 52 |
]
|
|
|
|
| 53 |
for text in texts:
|
| 54 |
t = preprocess(text)
|
| 55 |
print(f"{'-'*30}\n{t}")
|
| 56 |
candidates = fill_mask(t)
|
| 57 |
-
|
| 58 |
```
|
| 59 |
|
| 60 |
Output:
|
|
@@ -90,13 +92,12 @@ import numpy as np
|
|
| 90 |
from scipy.spatial.distance import cosine
|
| 91 |
from collections import Counter
|
| 92 |
|
| 93 |
-
def get_embedding(text):
|
| 94 |
text = preprocess(text)
|
| 95 |
encoded_input = tokenizer(text, return_tensors='pt')
|
| 96 |
features = model(**encoded_input)
|
| 97 |
features = features[0].detach().cpu().numpy()
|
| 98 |
-
|
| 99 |
-
return features_mean
|
| 100 |
|
| 101 |
|
| 102 |
MODEL = "cardiffnlp/twitter-roberta-base-sep2021"
|
|
|
|
| 22 |
If you're interested in retaining verified users who were also retained during training, you may keep the users listed [here](https://github.com/cardiffnlp/timelms/tree/main/data).
|
| 23 |
```python
|
| 24 |
def preprocess(text):
|
| 25 |
+
preprocessed_text = []
|
| 26 |
+
for t in text.split():
|
| 27 |
+
if len(t) > 1:
|
| 28 |
+
t = '@user' if t[0] == '@' and t.count('@') == 1 else t
|
| 29 |
+
t = 'http' if t.startswith('http') else t
|
| 30 |
+
preprocessed_text.append(t)
|
| 31 |
+
return ' '.join(preprocessed_text)
|
| 32 |
```
|
| 33 |
|
| 34 |
## Example Masked Language Model
|
|
|
|
| 40 |
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
|
| 41 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 42 |
|
| 43 |
+
def pprint(candidates, n):
|
| 44 |
+
for i in range(n):
|
| 45 |
token = tokenizer.decode(candidates[i]['token'])
|
| 46 |
score = candidates[i]['score']
|
| 47 |
print("%d) %.5f %s" % (i+1, score, token))
|
|
|
|
| 51 |
"I keep forgetting to bring a <mask>.",
|
| 52 |
"Looking forward to watching <mask> Game tonight!",
|
| 53 |
]
|
| 54 |
+
|
| 55 |
for text in texts:
|
| 56 |
t = preprocess(text)
|
| 57 |
print(f"{'-'*30}\n{t}")
|
| 58 |
candidates = fill_mask(t)
|
| 59 |
+
pprint(candidates, 5)
|
| 60 |
```
|
| 61 |
|
| 62 |
Output:
|
|
|
|
| 92 |
from scipy.spatial.distance import cosine
|
| 93 |
from collections import Counter
|
| 94 |
|
| 95 |
+
def get_embedding(text): # naive approach for demonstration
|
| 96 |
text = preprocess(text)
|
| 97 |
encoded_input = tokenizer(text, return_tensors='pt')
|
| 98 |
features = model(**encoded_input)
|
| 99 |
features = features[0].detach().cpu().numpy()
|
| 100 |
+
return np.mean(features[0], axis=0)
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
MODEL = "cardiffnlp/twitter-roberta-base-sep2021"
|