File size: 10,962 Bytes
c2ccd5d
489efed
 
c2ccd5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489efed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2ccd5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9672598
c2ccd5d
 
 
 
489efed
 
c2ccd5d
489efed
 
c2ccd5d
489efed
 
 
c2ccd5d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
{
  "_commit_hash": null,
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "chat_template": "{%- set image_placeholder = '<|vision_start|><|image_pad|><|vision_end|>' -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {{- '<|im_start|>system\n' -}}\n        {%- if message['content'] is string -%}\n            {{- message['content'] | trim -}}\n        {%- endif -%}\n        {{- '<|im_end|>\n' -}}\n    {%- elif message['role'] == 'user' -%}\n        {%- if loop.first -%}\n            {{- '<|im_start|>system\n' -}}\n            {%- if template -%}\n                {#--- If template, extraction task ---#}\n                {{- 'You are NuExtract, an information extraction tool created by NuMind.' -}}\n            {%- else -%}\n                {#--- Else, template generation task ---#}\n                {{- 'You are a helpful assistant.' -}}\n            {%- endif -%}\n            {{ '<|im_end|>\n' }}\n        {%- endif -%}\n        {{- '<|im_start|>' + message['role'] + '\n' -}}\n        {%- if template -%}\n            {#--- Template Section ---#}\n            {{- '# Template:\n' -}}\n            {{- template -}}\n            {{- '\n' -}}\n\n            {%- if examples -%}\n                {#--- Examples can only exist in the extraction task ---#}\n                {{- '# Examples:\n' -}}\n                {%- for example in examples -%}\n                    {{- '## Input:\n' -}}\n                    {%- if example['input'] is mapping and (example['input']['type'] == 'image' or example['input']['type'] == 'image_url') -%}\n                        {{- image_placeholder | trim -}}\n                    {%- elif example['input'] == '<image>' -%}\n                        {#--- Keep compatibility with <image> for now ---#}\n                        {{- image_placeholder | trim -}}\n                    {%- else -%}\n                        {#--- Text input example ---#}\n                        {{- example['input'] -}}\n                    {%- endif -%}\n                    {{- '\n' -}}\n                    {{- '## Output:\n' -}}\n                    {{- example['output'] -}}\n                    {{- '\n' -}}\n                {%- endfor -%}\n            {%- endif -%}\n            {{- '# Context:\n' -}}\n        {%- endif -%}\n\n        {%- if message['content'] is string -%}\n            {#--- Simple string content ---#}\n            {{- message['content'] | trim -}}\n        {%- elif message['content'] is mapping and (message['content']['type'] == 'image' or message['content']['type'] == 'image_url') -%}\n            {{- image_placeholder | trim -}}\n        {%- else -%}\n            {#--- List of content items (mixed text/images) ---#}\n            {#--- First, determine what the actual input content is (not ICL images) ---#}\n            {%- set ns = namespace(has_text_input=false, text_content='') -%}\n\n            {#--- Count content types and identify actual input document ---#}\n            {%- for content in message['content'] -%}\n                {%- if content is mapping and content.get('type') == 'text' -%}\n                    {%- if content.get('text') != '<image>' -%}\n                        {#--- Keep compatibility with <image> for now ---#}\n                        {%- set ns.has_text_input = true -%}\n                        {%- set ns.text_content = content['text'] -%}\n                    {%- endif -%}\n                {%- elif content is string -%}\n                    {%- if content != '<image>' -%}\n                        {#--- Keep compatibility with <image> for now ---#}\n                        {%- set ns.has_text_input = true -%}\n                        {%- set ns.text_content = content -%}\n                    {%- endif -%}\n                {%- endif -%}\n            {%- endfor -%}\n\n            {#--- Determine what to output based on actual input type ---#}\n            {%- if ns.has_text_input -%}\n                {#--- Main input is text, so output the text content ---#}\n                {{- ns.text_content | trim -}}\n            {%- else -%}\n                {#--- Main input is image or <image> placeholder ---#}\n                {%- set ns2 = namespace(found_image=false) -%}\n                {%- for content in message['content'] -%}\n                    {%- if content is mapping and (content.get('type') == 'image' or content.get('type') == 'image_url') and not ns2.found_image -%}\n                        {{- image_placeholder | trim -}}\n                        {%- set ns2.found_image = true -%}\n                    {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' and not ns2.found_image -%}\n                        {#--- Keep compatibility with <image> for now ---#}\n                        {{- image_placeholder | trim -}}\n                        {%- set ns2.found_image = true -%}\n                    {%- elif content is string and content == '<image>' and not ns2.found_image -%}\n                        {#--- Keep compatibility with <image> for now ---#}\n                        {{- image_placeholder | trim -}}\n                        {%- set ns2.found_image = true -%}\n                    {%- endif -%}\n                {%- endfor -%}\n            {%- endif -%}\n        {%- endif -%}\n        {{- '<|im_end|>\n'}}\n\n    {%- elif message['role'] == 'assistant' -%}\n        {{- '<|im_start|>assistant\n' -}}\n        {%- if message['content'] is string -%}\n            {{- message['content'] | trim -}}\n        {%- elif message['content'] is iterable and message['content'] is not string -%}\n            {%- for content in message['content'] -%}\n                {%- if content is mapping and content.get('type') == 'text' -%}\n                    {{- content['text'] | trim -}}\n                {%- elif content is string -%}\n                    {{- content | trim -}}\n                {%- endif -%}\n            {%- endfor -%}\n        {%- endif -%}\n        {{- '<|im_end|>\n' -}}\n    {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n    {{- '<|im_start|>assistant\n' -}}\n{%- endif -%}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "max_length": null,
  "max_pixels": 23000000,
  "min_pixels": 200704,
  "model_max_length": 131072,
  "pad_to_multiple_of": null,
  "pad_token": "<|endoftext|>",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "processor_class": "Qwen2_5_VLProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}