Spaces: Running on Zero

baseline08_beta0.4.1_07Oct25: fix permissions: oauth inference-api, write output markdown
Files changed:
- README.md +2 -1
- converters/pdf_to_md.py +7 -7
- globals.py +1 -0
- ui/gradio_process.py +126 -60
- ui/gradio_ui.py +11 -3
- utils/file_utils.py +4 -2
README.md CHANGED

```diff
@@ -9,7 +9,8 @@ python_version: 3.12
 command: python main.py
 app_file: main.py
 hf_oauth: true
-oauth_scopes: [read-access]
+#oauth_scopes: [read-access]
+hf_oauth_scopes: [read-access, inference-api]
 license: mit
 pinned: true
 short_description: PDF & HTML parser to markdown
```
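The commit renames the front-matter key to `hf_oauth_scopes` (the key documented for Spaces OAuth) and adds the `inference-api` scope, so the token issued at login can authenticate Inference API calls on the signed-in user's behalf. A minimal, hypothetical sketch of how such a token might be consumed downstream; the handler name and model id are illustrative, not from this repo:

```python
# Hypothetical consumer of the inference-api scope: the gr.OAuthToken that
# Gradio injects carries the signed-in user's bearer token, which
# huggingface_hub.InferenceClient accepts directly.
import gradio as gr
from huggingface_hub import InferenceClient

def generate(prompt: str, oauth_token: gr.OAuthToken | None = None) -> str:
    token = oauth_token.token if oauth_token else None  # fall back to env/login token
    client = InferenceClient(token=token)
    # model id is illustrative only
    return client.text_generation(prompt, model="HuggingFaceH4/zephyr-7b-beta")
```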
converters/pdf_to_md.py CHANGED

```diff
@@ -118,9 +118,9 @@ class PdfToMarkdownConverter:
     #duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
     duration = 60*config_load_models.pdf_files_count if config_load_models.use_llm else 90 ## sec
     @spaces.GPU(duration=duration) ## HF Spaces GPU support
-
-    def extract(self, src_path: str, output_dir: str, progress4=grP()): #Dict:
-
+    def extract(self, src_path: str, output_dir: str): ##-> Dict[str, int, Union[str, Path]]:
+    #def extract(self, src_path: str, output_dir: str, progress4=grP()): #Dict:
+    ###def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
         """
         Convert one file (PDF/HTML) to Markdown + images.
         Writes a `.md` file and any extracted images under `output_dir`.
@@ -152,15 +152,15 @@ class PdfToMarkdownConverter:
 
         # Run Marker conversion with LLM if use_llm is true
         try:
-            progress4((0,1), desc=f"Extracting File: {Path(src_path).name}")
-            time.sleep(0.75) #.sleep(0.25)
+            #progress4((0,1), desc=f"Extracting File: {Path(src_path).name}")
+            #time.sleep(0.75) #.sleep(0.25)
 
             #rendered = self.docconverter.converter(src_path)
             rendered = self.converter(src_path)
 
             logger.log(level=20, msg=f"✅ File extraction successful for {Path(src_path).name}")
-            progress4((1,1), desc=f"✅ File extraction successful for {Path(src_path).name}")
-            time.sleep(0.75) #.sleep(0.25)
+            #progress4((1,1), desc=f"✅ File extraction successful for {Path(src_path).name}")
+            #time.sleep(0.75) #.sleep(0.25)
         except Exception as exc:
             tb = traceback.format_exc()
             logger.exception(f"Error during file extraction — {exc}\n{tb}", exc_info=True) # Log the full traceback
```
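Note that `duration` is computed once, when the class body executes at import time, so `config_load_models.pdf_files_count` must already be populated by then. If the installed `spaces` release supports ZeroGPU's dynamic durations (passing a callable rather than an int), the GPU window could instead be sized per call. A sketch only, under that assumption, with `config_load_models` standing in for the module-level config used above:

```python
# Sketch: assumes the `spaces` package in use accepts a callable for
# `duration` (ZeroGPU "dynamic duration"); the callable receives the same
# arguments as the decorated function, `self` included.
import spaces

def gpu_duration(self, src_path: str, output_dir: str) -> int:
    # same expression as the module-level one, but evaluated at call time
    return 60 * config_load_models.pdf_files_count if config_load_models.use_llm else 90

class PdfToMarkdownConverter:
    @spaces.GPU(duration=gpu_duration)
    def extract(self, src_path: str, output_dir: str):
        ...
```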
globals.py CHANGED

```diff
@@ -45,6 +45,7 @@ class Config:
         self.page_range: str = None
         #self.weasyprint_dll_directories: str = None,
         self.tz_hours: str = None
+        self.pooling: str = "no_pooling", #bool = True #False
         #oauth_token: gr.OAuthToken | None=None,
         #progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
 
```
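One side effect worth flagging: the trailing comma on the new line makes Python parse the right-hand side as a one-element tuple, so the default is `("no_pooling",)` rather than the string `"no_pooling"`. It is harmless here only because `convert_batch` later overwrites `config_load.pooling` with the plain string coming from the UI. A quick check:

```python
# The trailing comma turns the annotated default into a tuple:
class Config:
    def __init__(self) -> None:
        self.pooling: str = "no_pooling",  # parsed as ("no_pooling",)

assert Config().pooling == ("no_pooling",)  # not "no_pooling"
```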
ui/gradio_process.py CHANGED

```diff
@@ -1,7 +1,9 @@
 # ui/gradio_process.py
 
+from re import Match
+from unittest import result
 import gradio as gr
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
 import time
@@ -65,8 +67,8 @@ def get_results_files_conversion(pdf_files, pdf_files_count, progress2=gr.Progre
     for i, pdf_file in enumerate(iterable=progress2.tqdm(
         iterable=pdf_files, #, max_retries), total=len(pdf_files)
         desc=f"Processing file conversion ... pool.map",
-        total=pdf_files_count)
-
+        total=pdf_files_count)
+        ):
         result_interim = pdf2md_converter.convert_files(pdf_file)
 
         # Update the Gradio UI to improve user-friendly eXperience
@@ -79,6 +81,100 @@ def get_results_files_conversion(pdf_files, pdf_files_count, progress2=gr.Progre
 
     return results
 
+def get_results_files_conversion_with_pool(pdf_files, pdf_files_count, max_workers: int, progress2=gr.Progress(track_tqdm=True)):
+    #Use progress.tqdm to integrate with the executor map
+
+    results = []
+    try:
+        # Create a pool with init_worker initialiser
+        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion.Marker already leverage ThreadPoolExecutor and ProcessPoolExecutor
+        with ProcessPoolExecutor(
+            max_workers=max_workers,
+        ) as pool:
+
+            logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files[:3], "files_len": len(pdf_files), "progress": str(progress2),})
+            progress2((10,16), desc=f"Starting ProcessPool queue: Processing Files ...")
+            time.sleep(0.25)
+
+            # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
+            #try:
+                #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+            #    progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
+            #    time.sleep(0.25)
+            #    yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+
+            # Use progress.tqdm to integrate with the executor map: results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
+            for i, result_interim in enumerate(progress2.tqdm(
+                iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
+                desc="ProcessPoolExecutor: Pooling file conversion ...",
+                total=pdf_files_count, unit="files")
+            ):
+
+                results.append(result_interim)
+
+                # Update the Gradio UI to improve user-friendly eXperience
+                yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
+                #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)[:20]}]")
+                progress2((i, pdf_files_count), desc=f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]")
+                time.sleep(0.25)
+    except Exception as exc:
+        # Raise the exception to stop the Gradio app: exception to halt execution
+        logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
+        tbp = traceback.print_exc() # Print the exception traceback
+        # Update the Gradio UI to improve user-friendly eXperience
+        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
+        return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
+    ##======
+
+    return results
+
+def get_results_files_conversion_with_pool_ascomplete(pdf_files, pdf_files_count, max_workers: int, progress2=gr.Progress(track_tqdm=True)):
+    """
+    This function wraps the as_completed call to process results
+    as they become available.
+    """
+    #Use progress.tqdm to integrate with the executor map
+
+    results = []
+    try:
+        # Create a pool with init_worker initialiser
+        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion.Marker already leverage ThreadPoolExecutor and ProcessPoolExecutor
+        with ProcessPoolExecutor(
+            max_workers=max_workers,
+        ) as pool:
+
+            logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "progress": str(progress2)}) #pdf_files_count
+            progress2((10,16), desc=f"Starting ProcessPool queue: Processing Files ...")
+            time.sleep(0.25)
+
+            # Submit each task individually and collect the futures
+            futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
+
+            # Use progress.tqdm to integrate with the executor map
+            for i, future in enumerate(progress2.tqdm(
+                iterable=as_completed(futures), #pdf_files,
+                desc="ProcessPoolExecutor: Pooling file conversion ...",
+                total=pdf_files_count, unit="files")
+            ):
+                result_interim = future.result()
+                results.append(result_interim)
+
+                # Update the Gradio UI to improve user-friendly eXperience
+                yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
+                #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)[:20]}]")
+                progress2((i, pdf_files_count), desc=f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]")
+                time.sleep(0.25)
+    except Exception as exc:
+        # Raise the exception to stop the Gradio app: exception to halt execution
+        logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
+        tbp = traceback.print_exc() # Print the exception traceback
+        # Update the Gradio UI to improve user-friendly eXperience
+        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
+        return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
+    ##======
+
+    return results
+
 ##SMY: TODO: future: refactor to gradio_process.py and
 ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
 #@spaces.GPU
@@ -115,6 +211,7 @@ def convert_batch(
     page_range: str = None, #Optional[str] = None,
     weasyprint_dll_directories: str = None, #weasyprint_libpath
     tz_hours: str = None,
+    pooling: str = "no_pooling", #bool = True,
     oauth_token: gr.OAuthToken | None=None,
     progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
     progress1: gr.Progress = gr.Progress(),
@@ -188,6 +285,7 @@ def convert_batch(
     weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
     config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
     config_load_models.pdf_files_count = pdf_files_count
+    #pooling = True ##SMY: placeholder
 
     progress((3,16), desc=f"Retrieved configuration values")
     time.sleep(0.25)
@@ -227,6 +325,7 @@ def convert_batch(
     config_load.page_range = page_range
     #config_load.weasyprint_dll_directories: str = None,
     config_load.tz_hours = tz_hours
+    config_load.pooling = pooling ## placeholder for ProcessPoolExecutor flag
 
     # 1. create output_dir
     try:
@@ -254,65 +353,32 @@ def convert_batch(
         yield gr.update(interactive=True), f"❌ An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
         return f"An error occurred creating output_dir: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
 
-    # 2. Process file conversion leveraging ProcessPoolExecutor for efficiency
+    # 2. Process file conversion leveraging ProcessPoolExecutor for efficiency
+    results = [] ## Processed files result holder
+    logger.log(level=30, msg="Initialising Processing Files ...", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
+    yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
+    progress((7,16), desc=f"Initialising Processing Files ...")
+    time.sleep(0.25)
+
     try:
-
-
-        yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
-        progress((7,16), desc=f"Initialising Processing Files ...")
+        #yield gr.update(interactive=True), f"Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+        progress((8,16), desc=f"Pooling file conversion ...")
         time.sleep(0.25)
-
-
-        ##SMY:
-
-
-
-
-
-
-
-        #progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
-        #time.sleep(0.25)
-
-        # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
-        # The 'docconverter' argument is implicitly handled by the initialiser
-        #futures = [pool.map(pdf2md_converter.convert_files, f) for f in pdf_files]
-        #logs = [f.result() for f in as_completed(futures)]
-        #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
-        #logs = [f.result() for f in futures]
-        try:
-            #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
-            progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
-            time.sleep(0.25)
-            yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
-
-            '''# Use progress.tqdm to integrate with the executor map
-            #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
-            for result_interim in progress.tqdm(
-                iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
-                desc="ProcessPoolExecutor: Pooling file conversion ..."):
-                results.append(result_interim)
-
-                # Update the Gradio UI to improve user-friendly eXperience
-                #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
-                #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
-                #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
-                #time.sleep(0.25)'''
-
-        results = get_results_files_conversion(pdf_files, pdf_files_count,progress)
+        yield gr.update(interactive=False), f"Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+
+        ##SMY: Future: users choose sequential or pooling from Gradio ui
+        match pooling:
+            case "no_pooling":
+                results = get_results_files_conversion(pdf_files, pdf_files_count,progress)
+            case "pooling":
+                results = get_results_files_conversion_with_pool(pdf_files, pdf_files_count, max_workers, progress)
+            case "as_completed":
+                results = get_results_files_conversion_with_pool_ascomplete(pdf_files, pdf_files_count, max_workers, progress)
 
-
-
-
-
-        except Exception as exc:
-            # Raise the exception to stop the Gradio app: exception to halt execution
-            logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
-            tbp = traceback.print_exc() # Print the exception traceback
-            # Update the Gradio UI to improve user-friendly eXperience
-            yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
-            return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
-
+        logger.log(level=30, msg="Got Results from files conversion: ", extra={"results": str(results)[:20]})
+        yield gr.update(interactive=True), f"Got Results from files conversion: [{str(results)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
+        progress((9,16), desc=f"Got Results from files conversion")
+        time.sleep(0.25)
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"❌ Error during Files processing — {exc}\n{tb}" , exc_info=True) # Log the full traceback
@@ -348,7 +414,7 @@ def convert_batch(
     except Exception as exc:
         tbp = traceback.print_exc() # Print the exception traceback
         logger.exception("Error during processing results logs — {exc}\n{tbp}", exc_info=True) # Log the full traceback
-        return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{
+        return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tbp}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
         #yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
 
 
```
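The two new helpers differ only in scheduling: `pool.map` hands results back in input order, while `submit` plus `as_completed` yields each result as soon as its worker finishes. One caveat: because both helpers contain `yield`, calling them returns a generator, so `results = get_results_files_conversion_with_pool(...)` in the `match` block binds a generator object and no pool work runs until it is iterated. A standalone sketch of the two scheduling strategies, with a stand-in `convert` in place of the repo's converter:

```python
# Input-order mapping vs completion-order collection with ProcessPoolExecutor.
from concurrent.futures import ProcessPoolExecutor, as_completed

def convert(path: str) -> str:
    # stand-in for pdf2md_converter.convert_files; must live at module level
    # so worker processes can unpickle it
    return path.upper()

def main() -> None:
    files = ["a.pdf", "b.pdf", "c.pdf"]

    with ProcessPoolExecutor(max_workers=3) as pool:
        ordered = list(pool.map(convert, files))  # always input order

    with ProcessPoolExecutor(max_workers=3) as pool:
        futures = [pool.submit(convert, f) for f in files]
        by_completion = [f.result() for f in as_completed(futures)]  # finish order

    print(ordered, by_completion)

if __name__ == "__main__":  # required under the spawn start method
    main()
```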
ui/gradio_ui.py CHANGED

```diff
@@ -80,7 +80,7 @@ def build_interface() -> gr.Blocks:
                 label="Max Tokens",
                 minimum=1,
                 maximum=131172, #65536, #32768, #16384, #8192,
-                value=1024, #512,
+                value=8192, #1024, #512,
                 step=1,
             )
             temperature_sl = gr.Slider(
@@ -140,11 +140,18 @@ def build_interface() -> gr.Blocks:
                 value="markdown",
             )
             with gr.Row():
+                #pooling_cb = gr.Checkbox(
+                pooling_dd = gr.Dropdown(
+                    label="Pool: multiprocessing",
+                    info="Enable for high # of files [Beware!]",
+                    value="no_pooling", #True, #False
+                    choices=["no_pooling", "pooling", "as_completed"]
+                )
                 max_workers_sl = gr.Slider(
                     label="Max Worker",
                     minimum=1,
                     maximum=4,
-                    value=
+                    value=3,
                     step=1
                 )
                 max_retries_sl = gr.Slider(
@@ -519,7 +526,8 @@ def build_interface() -> gr.Blocks:
             disable_ocr_math_cb,
             page_range_tb,
             weasyprint_dll_directories_tb,
-            tz_hours_num, #state_tz_hours
+            tz_hours_num, #state_tz_hours
+            pooling_dd,
         ]
 
         ## debug
```
utils/file_utils.py CHANGED

```diff
@@ -223,7 +223,8 @@ def check_create_file(filename: Union[str, Path]) -> Path:
     except PermissionError: ##[Errno 13] Permission denied: '/home/user/app/logs/app_logging_2025-09-18.log'
         warnings.warn("[Errno 13] Permission denied, possibly insufficient permission or Persistent Storage not enable: attempting chmod 0o2644")
         filename_path.touch(exist_ok=True, mode=0o2755) # Creates an empty file if it doesn't exists
-        filename_path.chmod(0)
+        #filename_path.chmod(0)
+        filename_path.chmod(0o2755)
 
     return filename_path
 
@@ -524,7 +525,8 @@ def write_markdown(
     ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
     md_path.parent.mkdir(mode=0o2755, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
     #md_path.parent.mkdir(parents=True, exist_ok=True) ##SMY: md_path now resides in Temp
-    md_path.parent.chmod(0)
+    #md_path.parent.chmod(0) ##resets permission (none): d--------- 2
+    md_path.parent.chmod(mode=0o2755)
 
     try:
         #markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
```