semmyk committed on
Commit
08fe9f3
·
1 Parent(s): a71344a

baseline08_beta0.3.4_03Oct25: fix slow Marker PdfConverter(); move @spaces.GPU to pool.map(); minor tweaks/fixes

Browse files
converters/extraction_converter.py CHANGED
@@ -168,6 +168,8 @@ class DocumentConverter:
168
  #config=config_parser.generate_config_dict(),
169
  #llm_service=self.llm_service ##SMY: expects a str, but self.llm_service is a service object (marker.services, of type BaseServices)
170
  llm_service=llm_service_str, ##resolve
 
 
171
  )
172
 
173
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
 
168
  #config=config_parser.generate_config_dict(),
169
  #llm_service=self.llm_service ##SMY: expects a str, but self.llm_service is a service object (marker.services, of type BaseServices)
170
  llm_service=llm_service_str, ##resolve
171
+ processor_list=config_parser.get_processors(),
172
+ renderer=config_parser.get_renderer(),
173
  )
174
 
175
  logger.log(level=20, msg="✔️ MarkerConverter instantiated successfully:", extra={"converter.config": str(self.converter.config.get("openai_base_url")), "use_llm":self.converter.use_llm})
requirements.txt CHANGED
@@ -6,8 +6,8 @@ gradio>=5.44.0 # gradio[mcp]>=5.44.0
6
 
7
  ## HF Spaces recommendation: https://huggingface.co/docs/hub/spaces-gpus#frameworks
8
  --extra-index-url https://download.pytorch.org/whl/cu113
9
- torch
10
- #torch>=2.7.1 # ZeroGPU support
11
  spaces>=0.42.1 # HF Spaces (default on HF Spaces)
12
  #huggingface_hub>=0.34.0 # HuggingFace integration
13
 
 
6
 
7
  ## HF Spaces recommendation: https://huggingface.co/docs/hub/spaces-gpus#frameworks
8
  --extra-index-url https://download.pytorch.org/whl/cu113
9
+ torch==2.7.0 # torch
10
+ #torch>=2.7.0 # ZeroGPU support
11
  spaces>=0.42.1 # HF Spaces (default on HF Spaces)
12
  #huggingface_hub>=0.34.0 # HuggingFace integration
13
 
ui/gradio_ui.py CHANGED
@@ -59,7 +59,7 @@ except Exception as exc:
59
  # pool executor to convert files called by Gradio
60
  ##SMY: TODO: future: refactor to gradio_process.py and
61
  ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math":}
62
- @spaces.GPU
63
  def convert_batch(
64
  pdf_files, #: list[str],
65
  pdf_files_count: int,
@@ -261,6 +261,7 @@ def convert_batch(
261
  #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
262
  #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
263
  #time.sleep(0.25)'''
 
264
  def get_results_pool_map(pdf_files, pdf_files_count, progress2=gr.Progress()):
265
  #Use progress.tqdm to integrate with the executor map
266
  #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
@@ -683,7 +684,7 @@ def build_interface() -> gr.Blocks:
683
  with gr.Column():
684
  page_range_tb = gr.Textbox(
685
  label="Page Range (Optional)",
686
- value=0,
687
  placeholder="Example: 0,1-5,8,12-15 ~(default: first page)",
688
  lines=1,
689
  max_lines=1,
 
59
  # pool executor to convert files called by Gradio
60
  ##SMY: TODO: future: refactor to gradio_process.py and
61
  ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math":}
62
+ #@spaces.GPU
63
  def convert_batch(
64
  pdf_files, #: list[str],
65
  pdf_files_count: int,
 
261
  #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
262
  #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
263
  #time.sleep(0.25)'''
264
+ @spaces.GPU ## HF Spaces GPU support
265
  def get_results_pool_map(pdf_files, pdf_files_count, progress2=gr.Progress()):
266
  #Use progress.tqdm to integrate with the executor map
267
  #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
 
684
  with gr.Column():
685
  page_range_tb = gr.Textbox(
686
  label="Page Range (Optional)",
687
+ value="0-0",
688
  placeholder="Example: 0,1-5,8,12-15 ~(default: first page)",
689
  lines=1,
690
  max_lines=1,