# parserPDF/utils/config.ini
[marker]
provider=openai
#model_id=openai/gpt-oss-120b
## Marker will return "LLM did not return a valid response" if the model is not 'Image-Text-to-Text',
## because OpenAI inference fails with Error code: 400 ... "Unsupported ChatMessageContent type: image_url"
## Note that Marker works pretty well using its own transformer-based model, without an LLM
model_id=meta-llama/Llama-4-Maverick-17B-128E-Instruct
hf_provider=fireworks-ai
endpoint_url=""
backend_choice=provider
system_message=""
max_tokens=8192
temperature=0.2
top_p=0.2
stream=True
api_token=a1b2c3
openai_model=openai/gpt-oss-120b
openai_api_key=a1b2c3
openai_base_url=https://router.huggingface.co/v1
openai_image_format=webp
#max_retries=3
#[Configuration]
use_llm=True
output_format=markdown
input_dir=inputs
output_dir=output_md
max_workers=4
max_retries=2
extract_images=True
output_image_format=png
output_encoding=utf-8
debug_data_folder=debug_data
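## A minimal sketch (not part of the app) of reading this [marker] section with
## Python's configparser; getint()/getboolean() handle values such as stream=True:
# import configparser
# cfg = configparser.ConfigParser()
# cfg.read("config.ini")
# provider = cfg.get("marker", "provider")
# max_tokens = cfg.getint("marker", "max_tokens")
# stream = cfg.getboolean("marker", "stream")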
[unsure]
image_output_dir="images"
image_output_format="png"
base_dir=Path(__file__).resolve().parent.parent
###
# Create a Path object from the current file's location, resolve it to an absolute path,
# and then get its parent's parent using chained .parent calls or the parents[] attribute.
#grandparent_dir = Path(__file__).resolve().parent.parent #os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
###
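## Illustrative (assumed) equivalents for reaching the grandparent directory:
# from pathlib import Path
# base_dir = Path(__file__).resolve().parent.parent   # chained .parent calls
# base_dir = Path(__file__).resolve().parents[1]      # parents[] attribute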
[libraries]
libobject_path = C:\\Dat\\dev\\gtk3-runtime\\bin
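## Sketch (assumption, not confirmed app code) of how a loader could register this
## GTK runtime path on Windows so the DLLs resolve; needs Python 3.8+:
# import os
# os.add_dll_directory(r"C:\Dat\dev\gtk3-runtime\bin")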
# from config.ini ##SMY: future plan to merge
[MARKER_CAP]
#[marker]
PROVIDER = openai
#MODEL_ID = openai/gpt-oss-120b
## Marker will return "LLM did not return a valid response" if the model is not 'Image-Text-to-Text',
## because OpenAI inference fails with Error code: 400 ... "Unsupported ChatMessageContent type: image_url"
## Note that Marker works pretty well using its own transformer-based model, without an LLM
MODEL_ID=meta-llama/Llama-4-Maverick-17B-128E-Instruct
HF_PROVIDER = fireworks-ai
ENDPOINT_URL = ""
BACKEND_CHOICE = provider
SYSTEM_MESSAGE = ""
MAX_TOKENS = 8192
TEMPERATURE = 0.2
TOP_P = 0.2
STREAM = True
API_TOKEN = a1b2c3
OPENAI_MODEL = openai/gpt-oss-120b
OPENAI_API_KEY = a1b2c3
OPENAI_BASE_URL = https://router.huggingface.co/v1
OPENAI_IMAGE_FORMAT = webp
#[CONFIGURATION]
MAX_WORKERS = 4
MAX_RETRIES = 2
OUTPUT_FORMAT = markdown
INPUT_DIR = inputs
OUTPUT_DIR = output_md
USE_LLM = False
EXTRACT_IMAGES = True
OUTPUT_IMAGE_FORMAT = png
OUTPUT_ENCODING = utf-8
DEBUG_DATA_FOLDER = debug_data
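## Sketch (assumption) of the planned merge of [marker] and [MARKER_CAP]: configparser
## lowercases option names, so both sections expose the same keys and can be merged
## with the lowercase section taking precedence:
# merged = {**dict(cfg["MARKER_CAP"]), **dict(cfg["marker"])}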
[UNSURE_CAP]
IMAGE_OUTPUT_DIR = images
IMAGE_OUTPUT_FORMAT = png
BASE_DIR = Path(__file__).resolve().parent.parent
###
# Create a Path object from the current file's location, resolve it to an absolute path
# Get its parent's parent using chained .parent calls or the parents[] attribute.
#grandparent_dir = Path(__file__).resolve().parent.parent #os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
###
[LIBRARIES_CAP]
LIBOBJECT_PATH = C:\\Dat\\dev\\gtk3-runtime\\bin
WEASYPRINT_DLL_DIRECTORIES = C:\\Dat\\dev\\gtk3-runtime\\bin
[GLOBAL_CAP]
# Globals within each worker process
HF_MODEL = "openai/gpt-oss-120b"
HF_TOKEN = ""
HF_CLIENT = None
ARTIFACT_DICT = None
PDF_CONVERTER = None
HTML_CONVERTER = None
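## Sketch (assumption) of initialising these per-worker globals once per process via
## a ProcessPoolExecutor initializer; make_client() is a hypothetical factory:
# from concurrent.futures import ProcessPoolExecutor
# HF_CLIENT = None
# def _init_worker():
#     global HF_CLIENT
#     HF_CLIENT = make_client()  # hypothetical: build the HF inference client once
# pool = ProcessPoolExecutor(max_workers=4, initializer=_init_worker)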
[marker_dict]
## "meta-llama/Llama-4-Maverick-17B-128E-Instruct:fireworks-ai"
provider:"openai" #provider,
model_id:"openai/gpt-oss-120b" #model_id, #"meta-llama/Llama-4-Maverick-17B-128E-Instruct:fireworks-ai"
hf_provider:"fireworks-ai" #hf_provider,
endpoint_url:"" #endpoint_url,
backend_choice:"provider" #backend_choice,
system_message:"" #system_message,
max_tokens:8192 #max_tokens,
temperature:0.2 #temperature,
top_p:0.2 #top_p,
stream:"stream"
api_token:"a1b2c3" #get_token,
output_format:"markdown" #output_format, #"markdown",
openai_model:"openai/gpt-oss-120b" #self.client.model_id, #"model_name"
openai_api_key:"a1b2c3" #self.client.openai_api_key, #self.api_token,
openai_base_url:"https://router.huggingface.co/v1" #self.client.base_url, #self.base_url,
#temperature=self.client.temperature,
#top_p=self.client.top_p,
openai_image_format:"webp" #"png" #better compatibility
max_retries:3 ## pass to __call__
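## configparser accepts ':' as well as '=' as a delimiter, so this section parses like
## the others. The trailing '#...' notes stay inside the values unless inline comment
## stripping is enabled; a sketch (assumption):
# cfg = configparser.ConfigParser(inline_comment_prefixes=("#",))
# cfg.read("config.ini")
# marker_dict = dict(cfg["marker_dict"])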
[marker_nostrip]
provider="openai"
model_id="openai/gpt-oss-120b"
hf_provider="fireworks-ai"
endpoint_url=""
backend_choice="provider"
system_message=""
max_tokens=8192
temperature=0.2
top_p=0.2
stream=True
api_token="a1b2c3"
openai_model="openai/gpt-oss-120b"
openai_api_key="a1b2c3"
openai_base_url="https://router.huggingface.co/v1"
openai_image_format="webp"
#max_retries=3
#[Configuration]
use_llm=True
output_format="markdown"
input_dir="inputs"
output_dir="output_md"
max_workers=4
max_retries=2
extract_images=True
output_image_format="png"
output_encoding="utf-8"
debug_data_folder="debug_data"
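## Note: configparser does not strip surrounding quotes, so provider above reads back
## as the string '"openai"' including the quotes; a sketch (assumption) of un-quoting:
# raw = cfg.get("marker_nostrip", "provider")  # -> '"openai"'
# value = raw.strip('"')                       # -> 'openai'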