NuExtract-2.0-2B / tokenizer_config.json

Update tokenizer_config.json

96fbce5 verified 5 days ago

10.1 kB

	{
	"_commit_hash": null,
	"add_bos_token": false,
	"add_prefix_space": false,
	"added_tokens_decoder": {
	"151643": {
	"content": "<\|endoftext\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151644": {
	"content": "<\|im_start\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151645": {
	"content": "<\|im_end\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151646": {
	"content": "<\|object_ref_start\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151647": {
	"content": "<\|object_ref_end\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151648": {
	"content": "<\|box_start\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151649": {
	"content": "<\|box_end\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151650": {
	"content": "<\|quad_start\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151651": {
	"content": "<\|quad_end\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151652": {
	"content": "<\|vision_start\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151653": {
	"content": "<\|vision_end\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151654": {
	"content": "<\|vision_pad\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151655": {
	"content": "<\|image_pad\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151656": {
	"content": "<\|video_pad\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": true
	},
	"151657": {
	"content": "<tool_call>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151658": {
	"content": "</tool_call>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151659": {
	"content": "<\|fim_prefix\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151660": {
	"content": "<\|fim_middle\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151661": {
	"content": "<\|fim_suffix\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151662": {
	"content": "<\|fim_pad\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151663": {
	"content": "<\|repo_name\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	},
	"151664": {
	"content": "<\|file_sep\|>",
	"lstrip": false,
	"normalized": false,
	"rstrip": false,
	"single_word": false,
	"special": false
	}
	},
	"additional_special_tokens": [
	"<\|im_start\|>",
	"<\|im_end\|>",
	"<\|object_ref_start\|>",
	"<\|object_ref_end\|>",
	"<\|box_start\|>",
	"<\|box_end\|>",
	"<\|quad_start\|>",
	"<\|quad_end\|>",
	"<\|vision_start\|>",
	"<\|vision_end\|>",
	"<\|vision_pad\|>",
	"<\|image_pad\|>",
	"<\|video_pad\|>"
	],
	"bos_token": null,
	"chat_template": "{%- set image_placeholder = '<\|vision_start\|><\|image_pad\|><\|vision_end\|>' -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'user' -%}\n {%- if loop.first and message['role'] != 'system' -%}\n {{- '<\|im_start\|>system\n' -}}\n {%- if template -%}\n {#--- If template, extraction task ---#}\n {{- 'You are NuExtract, an information extraction tool created by NuMind.' -}}\n {%- else -%}\n {#--- Else, template generation task ---#}\n {{- 'You are a helpful assistant.' -}}\n {%- endif -%}\n {{ '<\|im_end\|>\n' }}\n {%- endif -%}\n {{- '<\|im_start\|>' + message['role'] + '\n' -}}\n {%- if template -%}\n {#--- Template Section ---#}\n {{- '# Template:\n' -}}\n {{- template -}}\n {{- '\n' -}}\n \n {%- if examples -%}\n {#--- Examples can only exist in the extraction task ---#}\n {{- '# Examples:\n' -}}\n {%- for example in examples -%}\n {{- '## Input:\n' -}}\n {%- if example['input'] is mapping and (example['input']['type'] == 'image' or example['input']['type'] == 'image_url') -%}\n {{- image_placeholder \| trim -}}\n {%- elif example['input'] == '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder \| trim -}}\n {%- else -%}\n {#--- Text input example ---#}\n {{- example['input'] -}}\n {%- endif -%}\n {{- '\n' -}}\n {{- '## Output:\n' -}}\n {{- example['output'] -}}\n {{- '\n' -}}\n {%- endfor -%}\n {%- endif -%}\n {{- '# Context:\n' -}}\n {%- endif -%}\n \n {%- if message['content'] is string -%}\n {#--- Simple string content ---#}\n {{- message['content'] \| trim -}}\n {%- elif message['content'] is mapping and (message['content']['type'] == 'image' or message['content']['type'] == 'image_url') -%}\n {{- image_placeholder \| trim -}}\n {%- else -%}\n {#--- List of content items (mixed text/images) ---#}\n {#--- First, determine what the actual input content is (not ICL images) ---#}\n {%- set ns = namespace(has_text_input=false, text_content='') -%}\n \n {#--- Count content types and identify actual input document ---#}\n {%- for content in message['content'] -%}\n {%- if content is mapping and content.get('type') == 'text' -%}\n {%- if content.get('text') != '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content['text'] -%}\n {%- endif -%}\n {%- elif content is string -%}\n {%- if content != '<image>' -%}\n {#--- Keep compatibility with <image> for now ---#}\n {%- set ns.has_text_input = true -%}\n {%- set ns.text_content = content -%}\n {%- endif -%}\n {%- endif -%}\n {%- endfor -%}\n \n {#--- Determine what to output based on actual input type ---#}\n {%- if ns.has_text_input -%}\n {#--- Main input is text, so output the text content ---#}\n {{- ns.text_content \| trim -}}\n {%- else -%}\n {#--- Main input is image or <image> placeholder ---#}\n {%- set ns2 = namespace(found_image=false) -%}\n {%- for content in message['content'] -%}\n {%- if content is mapping and (content.get('type') == 'image' or content.get('type') == 'image_url') and not ns2.found_image -%}\n {{- image_placeholder \| trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is mapping and content.get('type') == 'text' and content.get('text') == '<image>' and not ns2.found_image -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder \| trim -}}\n {%- set ns2.found_image = true -%}\n {%- elif content is string and content == '<image>' and not ns2.found_image -%}\n {#--- Keep compatibility with <image> for now ---#}\n {{- image_placeholder \| trim -}}\n {%- set ns2.found_image = true -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n {{- '<\|im_end\|>\n'}}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{- '<\|im_start\|>assistant\n' -}}\n{%- endif -%}",
	"clean_up_tokenization_spaces": false,
	"eos_token": "<\|im_end\|>",
	"errors": "replace",
	"extra_special_tokens": {},
	"max_length": null,
	"max_pixels": 23000000,
	"min_pixels": 200704,
	"model_max_length": 131072,
	"pad_to_multiple_of": null,
	"pad_token": "<\|endoftext\|>",
	"pad_token_type_id": 0,
	"padding_side": "right",
	"processor_class": "Qwen2_5_VLProcessor",
	"split_special_tokens": false,
	"tokenizer_class": "Qwen2Tokenizer",
	"unk_token": null
	}