KoichiYasuoka committed
Commit 23cef86 · 1 Parent(s): 33d9657

initial release
Files changed (8)
  1. README.md +29 -0
  2. config.json +0 -0
  3. maker.py +133 -0
  4. pytorch_model.bin +3 -0
  5. special_tokens_map.json +37 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +73 -0
  8. ud.py +197 -0
README.md ADDED
@@ -0,0 +1,29 @@
+ ---
+ language:
+ - "pt"
+ tags:
+ - "portuguese"
+ - "token-classification"
+ - "pos"
+ - "dependency-parsing"
+ base_model: eliasjacob/ModernBERT-large-portuguese
+ datasets:
+ - "universal_dependencies"
+ license: "apache-2.0"
+ pipeline_tag: "token-classification"
+ ---
+
+ # modernbert-large-portuguese-ud-embeds
+
+ ## Model Description
+
+ This is a ModernBERT model for POS-tagging and dependency-parsing, derived from [ModernBERT-large-portuguese](https://huggingface.co/eliasjacob/ModernBERT-large-portuguese) and fine-tuned on the UD_Portuguese-Bosque, UD_Portuguese-GSD, and UD_Portuguese-PetroGold treebanks of Universal Dependencies (see `maker.py` below).
+
+ ## How to Use
+
+ ```py
+ from transformers import pipeline
+ nlp=pipeline("universal-dependencies","KoichiYasuoka/modernbert-large-portuguese-ud-embeds",trust_remote_code=True)
+ print(nlp("Foi quebrado pelo pelo do gato"))
+ ```
+
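The pipeline defined in `ud.py` (included in this commit) returns the parse as CoNLL-U text. As a minimal sketch, assuming the third-party `conllu` package is installed (`pip install conllu`), the output can be post-processed programmatically:

```py
# Minimal sketch: read the CoNLL-U string returned by the pipeline.
# The `conllu` package is an assumption; any CoNLL-U reader works.
from transformers import pipeline
from conllu import parse

nlp = pipeline("universal-dependencies", "KoichiYasuoka/modernbert-large-portuguese-ud-embeds", trust_remote_code=True)
for sentence in parse(nlp("Foi quebrado pelo pelo do gato")):
    for token in sentence:
        print(token["id"], token["form"], token["upos"], token["head"], token["deprel"])
```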
config.json ADDED
The diff for this file is too large to render. See raw diff
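Although config.json is not rendered here, `ud.py` below relies on two of its fields: the `label2id`/`id2label` mapping produced by `maker.py`, and (optionally) `task_specific_params["upos_multiword"]` for splitting Portuguese contractions; the pipeline falls back to an empty mapping if the latter is absent. A quick inspection sketch, assuming the files have been pushed to the target repository:

```py
# Sketch: inspect the config fields that ud.py depends on.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("KoichiYasuoka/modernbert-large-portuguese-ud-embeds", trust_remote_code=True)
print(len(cfg.id2label), "labels")              # UPOS / dependency-relation label set built by maker.py
print((cfg.task_specific_params or {}).keys())  # "upos_multiword", if present (ud.py falls back to {})
```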
 
maker.py ADDED
@@ -0,0 +1,133 @@
+ #! /usr/bin/python3
+ import os
+ src="eliasjacob/ModernBERT-large-portuguese"
+ tgt="KoichiYasuoka/modernbert-large-portuguese-ud-embeds"
+ url="https://github.com/UniversalDependencies/UD_Portuguese-"
+ for e in ["Bosque","GSD","PetroGold"]:
+   u=url+e
+   d=os.path.basename(u)
+   os.system("test -d "+d+" || git clone --depth=1 "+u)
+ s='BEGIN{FS=OFS="\\t";a[1]=a[2]=""};{if(NF==10){if($1~/-/&&$2~/-/)split($1,a,"-");else{if($1==a[1])$10="SpaceAfter=No";else if($1==a[2])$2="-"$2;print}}else{print;a[1]=a[2]=""}}'
+ os.system("for F in train dev test ; do nawk '"+s+"' UD_Portuguese-*/*-$F.conllu >$F.conllu ; done")
+ class UDEmbedsDataset(object):
+   def __init__(self,conllu,tokenizer,embeddings=None):
+     self.conllu=open(conllu,"r",encoding="utf-8")
+     self.tokenizer=tokenizer
+     self.embeddings=embeddings
+     self.seeks=[0]
+     label=set(["SYM","SYM.","SYM|_"])
+     dep=set()
+     s=self.conllu.readline()
+     while s!="":
+       if s=="\n":
+         self.seeks.append(self.conllu.tell())
+       else:
+         w=s.split("\t")
+         if len(w)==10:
+           if w[0].isdecimal():
+             p=w[3]
+             q="" if w[5]=="_" else "|"+w[5]
+             d=("|" if w[6]=="0" else "|l-" if int(w[0])<int(w[6]) else "|r-")+w[7]
+             for k in [p,p+".","B-"+p,"B-"+p+".","I-"+p,"I-"+p+".",p+q+"|_",p+q+d]:
+               label.add(k)
+       s=self.conllu.readline()
+     self.label2id={l:i for i,l in enumerate(sorted(label))}
+   def __call__(*args):
+     lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
+     for t in args:
+       t.label2id=lid
+     return lid
+   def __del__(self):
+     self.conllu.close()
+   __len__=lambda self:(len(self.seeks)-1)*2
+   def __getitem__(self,i):
+     self.conllu.seek(self.seeks[int(i/2)])
+     z,c,t,s,e,w,m=i%2,[],[""],"_","_",None,False
+     while t[0]!="\n":
+       t=self.conllu.readline().split("\t")
+       if len(t)==10:
+         if t[0].isdecimal():
+           if m:
+             t[1]=" "+t[1]
+           if w==None:
+             c.append(t)
+             m=t[9].find("SpaceAfter=No")
+           elif s==t[0]:
+             t[1]=w
+             t[6]="0"
+             c.append(t)
+           elif e==t[0]:
+             s=e="_"
+             w=None
+         elif z==0:
+           k=t[0].split("-")
+           if len(k)==2:
+             s,e=k
+             w=" "+t[1] if m else t[1]
+             m=t[9].find("SpaceAfter=No")
+     x=[True if t[6]=="0" or int(t[6])>int(t[0]) or sum([1 if int(c[i][6])==int(t[0]) else 0 for i in range(j+1,len(c))])>0 else False for j,t in enumerate(c)]
+     v=self.tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
+     if z==0:
+       ids,upos=[self.tokenizer.cls_token_id],["SYM."]
+       for i,(j,k) in enumerate(zip(v,c)):
+         if j==[]:
+           j=[self.tokenizer.unk_token_id]
+         p=k[3] if x[i] else k[3]+"."
+         ids+=j
+         upos+=[p] if len(j)==1 else ["B-"+p]+["I-"+p]*(len(j)-1)
+       ids.append(self.tokenizer.sep_token_id)
+       upos.append("SYM.")
+       emb=self.embeddings
+     else:
+       import torch
+       if len(x)<127:
+         x=[True]*len(x)
+         w=(len(x)+2)*(len(x)+1)/2
+       else:
+         w=sum([len(x)-i+1 if b else 0 for i,b in enumerate(x)])+1
+         for i in range(len(x)):
+           if x[i]==False and w+len(x)-i<8192:
+             x[i]=True
+             w+=len(x)-i+1
+       p=[t[3] if t[5]=="_" else t[3]+"|"+t[5] for i,t in enumerate(c)]
+       d=[t[7] if t[6]=="0" else "l-"+t[7] if int(t[0])<int(t[6]) else "r-"+t[7] for t in c]
+       ids,upos=[-1],["SYM|_"]
+       for i in range(len(x)):
+         if x[i]:
+           ids.append(i)
+           upos.append(p[i]+"|"+d[i] if c[i][6]=="0" else p[i]+"|_")
+           for j in range(i+1,len(x)):
+             ids.append(j)
+             upos.append(p[j]+"|"+d[j] if int(c[j][6])==i+1 else p[i]+"|"+d[i] if int(c[i][6])==j+1 else p[j]+"|_")
+           if w>8192 and i>0:
+             while w>8192 and upos[-1].endswith("|_"):
+               upos.pop(-1)
+               ids.pop(-1)
+               w-=1
+           ids.append(-1)
+           upos.append("SYM|_")
+       with torch.no_grad():
+         m=[]
+         for j in v:
+           if j==[]:
+             j=[self.tokenizer.unk_token_id]
+           m.append(self.embeddings[j,:].sum(axis=0))
+         m.append(self.embeddings[self.tokenizer.sep_token_id,:])
+         emb=torch.stack(m)
+     return{"inputs_embeds":emb[ids[:8192],:],"labels":[self.label2id[p] for p in upos[:8192]]}
+ from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
+ from tokenizers.pre_tokenizers import Sequence,Punctuation
+ tkz=AutoTokenizer.from_pretrained(src)
+ tkz.backend_tokenizer.pre_tokenizer=Sequence([Punctuation(),tkz.backend_tokenizer.pre_tokenizer])
+ trainDS=UDEmbedsDataset("train.conllu",tkz)
+ devDS=UDEmbedsDataset("dev.conllu",tkz)
+ testDS=UDEmbedsDataset("test.conllu",tkz)
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True,trust_remote_code=True)
+ mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True,trust_remote_code=True)
+ trainDS.embeddings=mdl.get_input_embeddings().weight
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=1,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
+ trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
+ trn.train()
+ trn.save_model(tgt)
+ tkz.save_pretrained(tgt)
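One detail worth noting in `maker.py` is the line that prepends a `Punctuation()` pre-tokenizer to the backbone tokenizer, so punctuation is isolated before subword splitting. A minimal sketch of that effect on the base tokenizer (the example sentence is only illustrative):

```py
# Sketch: effect of the Punctuation() pre-tokenizer prepended in maker.py.
from transformers import AutoTokenizer
from tokenizers.pre_tokenizers import Sequence, Punctuation

tkz = AutoTokenizer.from_pretrained("eliasjacob/ModernBERT-large-portuguese")
print(tkz.tokenize("Foi quebrado pelo pelo do gato."))  # default pre-tokenization
tkz.backend_tokenizer.pre_tokenizer = Sequence([Punctuation(), tkz.backend_tokenizer.pre_tokenizer])
print(tkz.tokenize("Foi quebrado pelo pelo do gato."))  # punctuation now split off before subword tokenization
```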
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc56e811d16418f8000018ef672be70765f5e2ed897412183467c091d7290d92
+ size 1598942274
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|padding|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "[MASK]",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 8192,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
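A quick sanity-check sketch for the settings declared above, i.e. the 8192-token `model_max_length` and the BERT-style special tokens, assuming the files have been uploaded to the target repository:

```py
# Sketch: confirm the length limit and special tokens from tokenizer_config.json.
from transformers import AutoTokenizer

tkz = AutoTokenizer.from_pretrained("KoichiYasuoka/modernbert-large-portuguese-ud-embeds")
print(tkz.model_max_length)  # 8192
print(tkz.cls_token, tkz.sep_token, tkz.pad_token, tkz.unk_token, tkz.mask_token)
```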
ud.py ADDED
@@ -0,0 +1,197 @@
+ import numpy
+ from transformers import TokenClassificationPipeline
+
+ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
+   def __init__(self,**kwargs):
+     super().__init__(**kwargs)
+     x=self.model.config.label2id
+     y=[k for k in x if k.find("|")<0 and not k.startswith("I-")]
+     self.transition=numpy.full((len(x),len(x)),-numpy.inf)
+     self.ilabel=numpy.full(len(x),-numpy.inf)
+     self.slabel=numpy.full(len(x),-numpy.inf)
+     for k,v in x.items():
+       if k.find("|")<0:
+         for j in ["I-"+k[2:]] if k.startswith("B-") else [k]+y if k.startswith("I-") else y:
+           self.transition[v,x[j]]=0
+         if k.startswith("I-"):
+           self.ilabel[v]=0
+         elif k.startswith("SYM"):
+           self.slabel[v]=0
+   def check_model_type(self,supported_models):
+     pass
+   def postprocess(self,model_outputs,**kwargs):
+     if "logits" not in model_outputs:
+       return self.postprocess(model_outputs[0],**kwargs)
+     return self.bellman_ford_token_classification(model_outputs,**kwargs)
+   def bellman_ford_token_classification(self,model_outputs,**kwargs):
+     m=model_outputs["logits"][0].numpy()
+     x=model_outputs["offset_mapping"][0].tolist()
+     for i,(s,e) in enumerate(x):
+       if s==0 and e==0:
+         m[i]+=self.slabel
+       elif i>0 and s<e and x[i-1][1]>s:
+         m[i]+=self.ilabel
+     e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))
+     z=e/e.sum(axis=-1,keepdims=True)
+     for i in range(m.shape[0]-1,0,-1):
+       m[i-1]+=numpy.max(m[i]+self.transition,axis=1)
+     k=[numpy.argmax(m[0]+self.transition[0])]
+     for i in range(1,m.shape[0]):
+       k.append(numpy.argmax(m[i]+self.transition[k[-1]]))
+     w=[{"entity":self.model.config.id2label[j],"start":s,"end":e,"score":z[i,j]} for i,((s,e),j) in enumerate(zip(x,k)) if s<e]
+     if "aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none":
+       for i,t in reversed(list(enumerate(w))):
+         p=t.pop("entity")
+         if i>0 and p.startswith("I-"):
+           w[i-1]["score"]=min(w[i-1]["score"],t["score"])
+           w[i-1]["end"]=w.pop(i)["end"]
+         elif i>0 and w[i-1]["end"]>t["start"]:
+           w[i-1]["score"]=min(w[i-1]["score"],t["score"])
+           w[i-1]["end"]=w.pop(i)["end"]
+         elif p.startswith("B-"):
+           t["entity_group"]=p[2:]
+         else:
+           t["entity_group"]=p
+     for t in w:
+       t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
+     return w
+
+ class UniversalDependenciesPipeline(BellmanFordTokenClassificationPipeline):
+   def __init__(self,**kwargs):
+     kwargs["aggregation_strategy"]="simple"
+     super().__init__(**kwargs)
+     x=self.model.config.label2id
+     self.root=numpy.full((len(x)),-numpy.inf)
+     self.left_arc=numpy.full((len(x)),-numpy.inf)
+     self.right_arc=numpy.full((len(x)),-numpy.inf)
+     for k,v in x.items():
+       if k.endswith("|root"):
+         self.root[v]=0
+       elif k.find("|l-")>0:
+         self.left_arc[v]=0
+       elif k.find("|r-")>0:
+         self.right_arc[v]=0
+     self.multiword={}
+     if self.model.config.task_specific_params:
+       if "upos_multiword" in self.model.config.task_specific_params:
+         self.multiword=self.model.config.task_specific_params["upos_multiword"]
+   def postprocess(self,model_outputs,**kwargs):
+     import torch
+     kwargs["aggregation_strategy"]="simple"
+     if "logits" not in model_outputs:
+       return self.postprocess(model_outputs[0],**kwargs)
+     w=self.bellman_ford_token_classification(model_outputs,**kwargs)
+     off=[(t["start"],t["end"]) for t in w]
+     for i,(s,e) in reversed(list(enumerate(off))):
+       if s<e:
+         d=w[i]["text"]
+         j=len(d)-len(d.lstrip())
+         if j>0:
+           d=d.lstrip()
+           off[i]=(off[i][0]+j,off[i][1])
+         j=len(d)-len(d.rstrip())
+         if j>0:
+           d=d.rstrip()
+           off[i]=(off[i][0],off[i][1]-j)
+         if d.strip()=="":
+           off.pop(i)
+           w.pop(i)
+         else:
+           p=w[i]["entity_group"]
+           if p in self.multiword:
+             d=d.lower()
+             if d in self.multiword[p]:
+               j=self.multiword[p][d]
+               if "".join(j)==d:
+                 for k in reversed(j[1:]):
+                   e=off[i][1]
+                   w.insert(i+1,{"start":e-len(k),"end":e,"text":k,"entity_group":"","score":w[i]["score"]})
+                   off.insert(i+1,(e-len(k),e))
+                   w[i]["end"]=e-len(k)
+                   off[i]=(off[i][0],e-len(k))
+                 w[i]["text"]=" "+j[0]
+                 w[i]["entity_group"]=""
+               else:
+                 s,e=off[i]
+                 for k in reversed(j[1:]):
+                   w.insert(i+1,{"start":s,"end":e,"text":" "+k,"entity_group":"+","score":w[i]["score"]})
+                   off.insert(i+1,(s,e))
+                 w[i]["text"]=" "+j[0]
+                 w[i]["entity_group"]=f"+{len(j)}"
+     v=self.tokenizer([t["text"] for t in w],add_special_tokens=False)
+     x=[not t["entity_group"].endswith(".") for t in w]
+     if len(x)<127:
+       x=[True]*len(x)
+     else:
+       k=sum([len(x)-i+1 if b else 0 for i,b in enumerate(x)])+1
+       for i in numpy.argsort(numpy.array([t["score"] for t in w])):
+         if x[i]==False and k+len(x)-i<8192:
+           x[i]=True
+           k+=len(x)-i+1
+     ids=[-1]
+     for i in range(len(x)):
+       if x[i]:
+         ids.append(i)
+         for j in range(i+1,len(x)):
+           ids.append(j)
+         ids.append(-1)
+     with torch.no_grad():
+       e=self.model.get_input_embeddings().weight
+       m=[]
+       for j in v["input_ids"]:
+         if j==[]:
+           j=[self.tokenizer.unk_token_id]
+         m.append(e[j,:].sum(axis=0))
+       m.append(e[self.tokenizer.sep_token_id,:])
+       m=torch.stack(m).to(self.device)
+       e=self.model(inputs_embeds=torch.unsqueeze(m[ids,:],0))
+     m=e.logits[0].cpu().numpy()
+     e=numpy.full((len(x),len(x),m.shape[-1]),m.min())
+     k=1
+     for i in range(len(x)):
+       if x[i]:
+         e[i,i]=m[k]+self.root
+         k+=1
+         for j in range(1,len(x)-i):
+           e[i+j,i]=m[k]+self.left_arc
+           e[i,i+j]=m[k]+self.right_arc
+           k+=1
+         k+=1
+     m,p=numpy.max(e,axis=2),numpy.argmax(e,axis=2)
+     h=self.chu_liu_edmonds(m)
+     z=[i for i,j in enumerate(h) if i==j]
+     if len(z)>1:
+       k,h=z[numpy.argmax(m[z,z])],numpy.min(m)-numpy.max(m)
+       m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
+       h=self.chu_liu_edmonds(m)
+     q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
+     t=model_outputs["sentence"].replace("\n"," ")
+     u="# text = "+t+"\n"
+     for i,(s,e) in enumerate(off):
+       m=w[i]["entity_group"]
+       if m.startswith("+"):
+         if m!="+":
+           u+="\t".join([f"{i+1}-{i+int(m)}",t[s:e],"_","_","_","_","_","_","_","_" if i+int(m)<len(off) and e<off[i+int(m)][0] else "SpaceAfter=No"])+"\n"
+         u+="\t".join([str(i+1),w[i]["text"].strip(),"_",q[i][0],"_","_" if len(q[i])<3 else "|".join(q[i][1:-1]),str(0 if h[i]==i else h[i]+1),"root" if q[i][-1]=="root" else q[i][-1][2:],"_","_"])+"\n"
+       else:
+         u+="\t".join([str(i+1),t[s:e],"_",q[i][0],"_","_" if len(q[i])<3 else "|".join(q[i][1:-1]),str(0 if h[i]==i else h[i]+1),"root" if q[i][-1]=="root" else q[i][-1][2:],"_","_" if i+1<len(off) and e<off[i+1][0] else "SpaceAfter=No"])+"\n"
+     return u+"\n"
+   def chu_liu_edmonds(self,matrix):
+     h=numpy.argmax(matrix,axis=0)
+     x=[-1 if i==j else j for i,j in enumerate(h)]
+     for b in [lambda x,i,j:-1 if i not in x else x[i],lambda x,i,j:-1 if j<0 else x[j]]:
+       y=[]
+       while x!=y:
+         y=list(x)
+         for i,j in enumerate(x):
+           x[i]=b(x,i,j)
+     if max(x)<0:
+       return h
+     y,x=[i for i,j in enumerate(x) if j==max(x)],[i for i,j in enumerate(x) if j<max(x)]
+     z=matrix-numpy.max(matrix,axis=0)
+     m=numpy.block([[z[x,:][:,x],numpy.max(z[x,:][:,y],axis=1).reshape(len(x),1)],[numpy.max(z[y,:][:,x],axis=0),numpy.max(z[y,y])]])
+     k=[j if i==len(x) else x[j] if j<len(x) else y[numpy.argmax(z[y,x[i]])] for i,j in enumerate(self.chu_liu_edmonds(m))]
+     h=[j if i in y else k[x.index(i)] for i,j in enumerate(h)]
+     i=y[numpy.argmax(z[x[k[-1]],y] if k[-1]<len(x) else z[y,y])]
+     h[i]=x[k[-1]] if k[-1]<len(x) else i
+     return h
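`UniversalDependenciesPipeline.postprocess` above returns the parse as a CoNLL-U string (one `# text = ...` line followed by tab-separated token rows). As a sketch, assuming the third-party `deplacy` package, which accepts CoNLL-U text, the tree can be rendered for a quick visual check:

```py
# Sketch: visualize the CoNLL-U output with deplacy (a third-party package, assumed here).
import deplacy
from transformers import pipeline

nlp = pipeline("universal-dependencies", "KoichiYasuoka/modernbert-large-portuguese-ud-embeds", trust_remote_code=True)
deplacy.render(nlp("Foi quebrado pelo pelo do gato"))
```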