semmyk committed
Commit 15e9c77 · 1 Parent(s): c6fb648

baseline08_beta0.4.1_07Oct25: fix permissions: oauth inference-api, write output markdown
README.md CHANGED
@@ -9,7 +9,8 @@ python_version: 3.12
 command: python main.py
 app_file: main.py
 hf_oauth: true
-oauth_scopes: [read-access]
+#oauth_scopes: [read-access]
+hf_oauth_scopes: [read-access, inference-api]
 license: mit
 pinned: true
 short_description: PDF & HTML parser to markdown
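The key fix: `hf_oauth_scopes` (rather than `oauth_scopes`) is the metadata key Spaces reads, and adding `inference-api` lets the OAuth token issued to the signed-in user call hosted inference. A minimal sketch of consuming that token in a Gradio handler, assuming the standard `huggingface_hub` client; the `summarise` handler and model id are illustrative, not from this repo:

    import gradio as gr
    from huggingface_hub import InferenceClient

    def summarise(text: str, oauth_token: gr.OAuthToken | None = None):
        # Gradio injects the signed-in user's token when hf_oauth is enabled;
        # without the inference-api scope this call would be rejected.
        if oauth_token is None:
            return "Please sign in with Hugging Face first."
        client = InferenceClient(token=oauth_token.token)
        return client.text_generation(f"Summarise: {text}",
                                      model="HuggingFaceH4/zephyr-7b-beta")  # illustrative model id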
converters/pdf_to_md.py CHANGED
@@ -118,9 +118,9 @@ class PdfToMarkdownConverter:
     #duration = 60*config_load_models.pdf_files_count if config_load_models.pdf_files_count>=10 else 360 ## sec
     duration = 60*config_load_models.pdf_files_count if config_load_models.use_llm else 90 ## sec
     @spaces.GPU(duration=duration) ## HF Spaces GPU support
-    #def extract(self, src_path: str, output_dir: str) -> Dict[str, int, Union[str, Path]]:
-    def extract(self, src_path: str, output_dir: str, progress4=grP()): #Dict:
-    #def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
+    def extract(self, src_path: str, output_dir: str): ##-> Dict[str, int, Union[str, Path]]:
+    #def extract(self, src_path: str, output_dir: str, progress4=grP()): #Dict:
+    ###def extract(src_path: str, output_dir: str) -> Dict[str, int]: #, extractor: DocumentExtractor) -> Dict[str, int]:
         """
         Convert one file (PDF/HTML) to Markdown + images.
         Writes a `.md` file and any extracted images under `output_dir`.
@@ -152,15 +152,15 @@ class PdfToMarkdownConverter:
 
         # Run Marker conversion with LLM if use_llm is true
         try:
-            progress4((0,1), desc=f"Extracting File: {Path(src_path).name}")
-            time.sleep(0.75) #.sleep(0.25)
+            #progress4((0,1), desc=f"Extracting File: {Path(src_path).name}")
+            #time.sleep(0.75) #.sleep(0.25)
 
             #rendered = self.docconverter.converter(src_path)
             rendered = self.converter(src_path)
 
             logger.log(level=20, msg=f"✓ File extraction successful for {Path(src_path).name}")
-            progress4((1,1), desc=f"✓ File extraction successful for {Path(src_path).name}")
-            time.sleep(0.75) #.sleep(0.25)
+            #progress4((1,1), desc=f"✓ File extraction successful for {Path(src_path).name}")
+            #time.sleep(0.75) #.sleep(0.25)
         except Exception as exc:
             tb = traceback.format_exc()
             logger.exception(f"Error during file extraction → {exc}\n{tb}", exc_info=True) # Log the full traceback
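Note that the decorator argument above is evaluated once, when the class body executes, so `duration` reflects whatever the config holds at import. A minimal standalone sketch of the same dynamic-lease pattern on a ZeroGPU Space; the `Config` stub and `run_marker` name are illustrative:

    import spaces  # Hugging Face ZeroGPU decorator

    class Config:          # stand-in for config_load_models
        use_llm = True
        pdf_files_count = 5

    config = Config()
    # Longer GPU lease when an LLM pass runs per file, else a short default.
    duration = 60 * config.pdf_files_count if config.use_llm else 90

    @spaces.GPU(duration=duration)   # evaluated once, at import time
    def run_marker(src_path: str):
        ...  # GPU-bound Marker conversion runs inside the leased window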
globals.py CHANGED
@@ -45,6 +45,7 @@ class Config:
         self.page_range: str = None
         #self.weasyprint_dll_directories: str = None,
         self.tz_hours: str = None
+        self.pooling: str = "no_pooling", #bool = True #False
         #oauth_token: gr.OAuthToken | None=None,
         #progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
 
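One caution on the added line: the trailing comma after `"no_pooling"` makes the stored value a one-element tuple, not a string, despite the `str` annotation. In practice `convert_batch` reassigns `config_load.pooling` from the UI value, but if the default were ever read directly, the later `match pooling:` dispatch would fall through every string case:

    class Config:
        def __init__(self):
            self.pooling: str = "no_pooling",   # trailing comma => ("no_pooling",)

    cfg = Config()
    print(type(cfg.pooling))                    # <class 'tuple'>

    match cfg.pooling:
        case "no_pooling":
            print("sequential")
        case _:
            print("no case matched: value is a tuple")  # this branch runs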
ui/gradio_process.py CHANGED
@@ -1,7 +1,9 @@
 # ui/gradio_process.py
 
+from re import Match
+from unittest import result
 import gradio as gr
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
 import time
@@ -65,8 +67,8 @@ def get_results_files_conversion(pdf_files, pdf_files_count, progress2=gr.Progre
     for i, pdf_file in enumerate(iterable=progress2.tqdm(
         iterable=pdf_files, #, max_retries), total=len(pdf_files)
         desc=f"Processing file conversion ... pool.map",
-        total=pdf_files_count),
-        start=1):
+        total=pdf_files_count)
+        ):
         result_interim = pdf2md_converter.convert_files(pdf_file)
 
         # Update the Gradio UI to improve user-friendly eXperience
@@ -79,6 +81,100 @@
 
     return results
 
+def get_results_files_conversion_with_pool(pdf_files, pdf_files_count, max_workers: int, progress2=gr.Progress(track_tqdm=True)):
+    #Use progress.tqdm to integrate with the executor map
+
+    results = []
+    try:
+        # Create a pool with init_worker initialiser
+        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion.Marker already leverage ThreadPoolExecutor and ProcessPoolExecutor
+        with ProcessPoolExecutor(
+            max_workers=max_workers,
+        ) as pool:
+
+            logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files[:3], "files_len": len(pdf_files), "progress": str(progress2),})
+            progress2((10,16), desc=f"Starting ProcessPool queue: Processing Files ...")
+            time.sleep(0.25)
+
+            # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
+            #try:
+            #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+            # progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
+            # time.sleep(0.25)
+            # yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+
+            # Use progress.tqdm to integrate with the executor mapresults = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
+            for i, result_interim in enumerate(progress2.tqdm(
+                iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
+                desc="ProcessPoolExecutor: Pooling file conversion ...",
+                total=pdf_files_count, unit="files")
+                ):
+
+                results.append(result_interim)
+
+                # Update the Gradio UI to improve user-friendly eXperience
+                yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
+                #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)[:20]}]")
+                progress2((i, pdf_files_count), desc=f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]")
+                time.sleep(0.25)
+    except Exception as exc:
+        # Raise the exception to stop the Gradio app: exception to halt execution
+        logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
+        tbp = traceback.print_exc() # Print the exception traceback
+        # Update the Gradio UI to improve user-friendly eXperience
+        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
+        return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
+    ##======
+
+    return results
+
+def get_results_files_conversion_with_pool_ascomplete(pdf_files, pdf_files_count, max_workers: int, progress2=gr.Progress(track_tqdm=True)):
+    """
+    This function wraps the as_completed call to process results
+    as they become available.
+    """
+    #Use progress.tqdm to integrate with the executor map
+
+    results = []
+    try:
+        # Create a pool with init_worker initialiser
+        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion.Marker already leverage ThreadPoolExecutor and ProcessPoolExecutor
+        with ProcessPoolExecutor(
+            max_workers=max_workers,
+        ) as pool:
+
+            logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "progress": str(progress2)}) #pdf_files_count
+            progress2((10,16), desc=f"Starting ProcessPool queue: Processing Files ...")
+            time.sleep(0.25)
+
+            # Submit each task individually and collect the futures
+            futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
+
+            # Use progress.tqdm to integrate with the executor mapresults = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
+            for i, future in enumerate(progress2.tqdm(
+                iterable=as_completed(futures), #pdf_files,
+                desc="ProcessPoolExecutor: Pooling file conversion ...",
+                total=pdf_files_count, unit="files")
+                ):
+                result_interim = future.result()
+                results.append(result_interim)
+
+                # Update the Gradio UI to improve user-friendly eXperience
+                yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
+                #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)[:20]}]")
+                progress2((i, pdf_files_count), desc=f"ProcessPoolExecutor: Pooling file conversion result: {i} : [{str(result_interim)[:20]}]")
+                time.sleep(0.25)
+    except Exception as exc:
+        # Raise the exception to stop the Gradio app: exception to halt execution
+        logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
+        tbp = traceback.print_exc() # Print the exception traceback
+        # Update the Gradio UI to improve user-friendly eXperience
+        yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
+        return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
+    ##======
+
+    return results
+
 ##SMY: TODO: future: refactor to gradio_process.py and
 ## pull options to cli-options{"output_format":, "output_dir_string":, "use_llm":, "page_range":, "force_ocr":, "debug":, "strip_existing_ocr":, "disable_ocr_math""}
 #@spaces.GPU
@@ -115,6 +211,7 @@ def convert_batch(
     page_range: str = None, #Optional[str] = None,
     weasyprint_dll_directories: str = None, #weasyprint_libpath
     tz_hours: str = None,
+    pooling: str = "no_pooling", #bool = True,
     oauth_token: gr.OAuthToken | None=None,
     progress: gr.Progress = gr.Progress(track_tqdm=True), #Progress tracker to keep tab on pool queue executor
     progress1: gr.Progress = gr.Progress(),
@@ -188,6 +285,7 @@ def convert_batch(
     weasyprint_dll_directories= weasyprint_dll_directories if weasyprint_dll_directories else None
     config_load_models.weasyprint_libpath = weasyprint_dll_directories ## Assign user's weasyprint path to Global var
     config_load_models.pdf_files_count = pdf_files_count
+    #pooling = True ##SMY: placeholder
 
     progress((3,16), desc=f"Retrieved configuration values")
     time.sleep(0.25)
@@ -227,6 +325,7 @@ def convert_batch(
     config_load.page_range = page_range
     #config_load.weasyprint_dll_directories: str = None,
     config_load.tz_hours = tz_hours
+    config_load.pooling = pooling ## placeholder for ProcessPoolExecutor flag
 
     # 1. create output_dir
     try:
@@ -254,65 +353,32 @@ def convert_batch(
     yield gr.update(interactive=True), f"✗ An error occurred creating output_dir: {str(exc)}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
     return f"An error occurred creating output_dir: {str(exc)}", f"Error: {exc}", f"Error: {exc}" ## return the exception message
 
-    # 2. Process file conversion leveraging ProcessPoolExecutor for efficiency
-    try:
-        results = [] ## Processed files result holder
-        logger.log(level=30, msg="Initialising Processing Files ...", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
-        yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
-        progress((7,16), desc=f"Initialising Processing Files ...")
-        time.sleep(0.25)
-
-        # Create a pool with init_worker initialiser
-        ##SMY: dropped ProcessPoolExecutor due to slow Marker conversion.Marker already leverage ThreadPoolExecutor and ProcessPoolExecutor
-        '''with ProcessPoolExecutor(
-            max_workers=max_workers,
-            initializer=init_worker,
-            initargs=init_args
-        ) as pool:'''
-
-        #logger.log(level=30, msg="Initialising ProcessPoolExecutor: pool:", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
-        #progress((8,16), desc=f"Starting ProcessPool queue: Processing Files ...")
-        #time.sleep(0.25)
-
-        # Map the files (pdf_files) to the conversion function (pdf2md_converter.convert_file)
-        # The 'docconverter' argument is implicitly handled by the initialiser
-        #futures = [pool.map(pdf2md_converter.convert_files, f) for f in pdf_files]
-        #logs = [f.result() for f in as_completed(futures)]
-        #futures = [pool.submit(pdf2md_converter.convert_files, file) for file in pdf_files]
-        #logs = [f.result() for f in futures]
-        try:
-            #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
-            progress((9,16), desc=f"ProcessPoolExecutor: Pooling file conversion ...")
-            time.sleep(0.25)
-            yield gr.update(interactive=False), f"ProcessPoolExecutor: Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
-
-            '''# Use progress.tqdm to integrate with the executor map
-            #results = pool.map(pdf2md_converter.convert_files, pdf_files) ##SMY iterables #max_retries #output_dir_string)
-            for result_interim in progress.tqdm(
-                iterable=pool.map(pdf2md_converter.convert_files, pdf_files), #, max_retries), total=len(pdf_files)
-                desc="ProcessPoolExecutor: Pooling file conversion ..."):
-                results.append(result_interim)
-
-            # Update the Gradio UI to improve user-friendly eXperience
-            #yield gr.update(interactive=True), f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]", {"process": "Processing files ..."}, f"dummy_log.log"
-            #progress((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
-            #progress2((10,16), desc=f"ProcessPoolExecutor: Pooling file conversion result: [{str(result_interim)}[:20]]")
-            #time.sleep(0.25)'''
-
-            results = get_results_files_conversion(pdf_files, pdf_files_count,progress)
-
-            logger.log(level=30, msg="Got Results from files conversion: ", extra={"results": str(results)[:20]})
-            yield gr.update(interactive=True), f"Got Results from files conversion: [{str(results)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
-            progress((11,16), desc=f"Got Results from files conversion")
-            time.sleep(0.25)
-        except Exception as exc:
-            # Raise the exception to stop the Gradio app: exception to halt execution
-            logger.exception("Error during pooling file conversion", exc_info=True) # Log the full traceback
-            tbp = traceback.print_exc() # Print the exception traceback
-            # Update the Gradio UI to improve user-friendly eXperience
-            yield gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log" ## return the exception message
-            return [gr.update(interactive=True), f"An error occurred during pool.map: {str(exc)}", {"Error":f"Error: {exc}\n{tbp}"}, f"dummy_log.log"] ## return the exception message
-
+    # 2. Process file conversion leveraging ProcessPoolExecutor for efficiency
+    results = [] ## Processed files result holder
+    logger.log(level=30, msg="Initialising Processing Files ...", extra={"pdf_files": pdf_files, "files_len": len(pdf_files), "model_id": model_id, "output_dir": output_dir_string}) #pdf_files_count
+    yield gr.update(interactive=False), f"Initialising Processing Files ...", {"process": "Processing files ..."}, f"dummy_log.log"
+    progress((7,16), desc=f"Initialising Processing Files ...")
+    time.sleep(0.25)
+
+    try:
+        #yield gr.update(interactive=True), f"Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+        progress((8,16), desc=f"Pooling file conversion ...")
+        time.sleep(0.25)
+        yield gr.update(interactive=False), f"Pooling file conversion ...", {"process": "Processing files ..."}, f"dummy_log.log"
+
+        ##SMY: Future: users choose sequential or pooling from Gradio ui
+        match pooling:
+            case "no_pooling":
+                results = get_results_files_conversion(pdf_files, pdf_files_count,progress)
+            case "pooling":
+                results = get_results_files_conversion_with_pool(pdf_files, pdf_files_count, max_workers, progress)
+            case "as_completed":
+                results = get_results_files_conversion_with_pool_ascomplete(pdf_files, pdf_files_count, max_workers, progress)
+
+        logger.log(level=30, msg="Got Results from files conversion: ", extra={"results": str(results)[:20]})
+        yield gr.update(interactive=True), f"Got Results from files conversion: [{str(results)[:20]}]", {"process": "Processing files ..."}, f"dummy_log.log"
+        progress((9,16), desc=f"Got Results from files conversion")
+        time.sleep(0.25)
     except Exception as exc:
         tb = traceback.format_exc()
         logger.exception(f"✗ Error during Files processing → {exc}\n{tb}" , exc_info=True) # Log the full traceback
@@ -348,7 +414,7 @@ def convert_batch(
     except Exception as exc:
         tbp = traceback.print_exc() # Print the exception traceback
        logger.exception("Error during processing results logs → {exc}\n{tbp}", exc_info=True) # Log the full traceback
-        return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
+        return [gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tbp}", {"Error":f"Error: {exc}"}, f"dummy_log.log"] ## return the exception message
         #yield gr.update(interactive=True), f"An error occurred during processing results logs: {str(exc)}\n{tb}", {"Error":f"Error: {exc}"}, f"dummy_log.log" ## return the exception message
 
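The match block gives users three conversion strategies: sequential, pool.map (ordered results), and as_completed (finish-order results). Two caveats: the submitted callable must be picklable for a process pool, and because both pool helpers contain yield, calling them returns a generator object rather than the results list, so the "pooling" and "as_completed" branches would need to be iterated (or have their yields removed) to behave like the sequential one. A plain, non-generator sketch of the three strategies, with `convert` standing in for `pdf2md_converter.convert_files`:

    from concurrent.futures import ProcessPoolExecutor, as_completed

    def convert(path: str) -> str:
        # Stand-in for pdf2md_converter.convert_files; must be a top-level
        # (picklable) function for ProcessPoolExecutor to ship to workers.
        return path.upper()

    def run_batch(paths, strategy="no_pooling", max_workers=3):
        if strategy == "no_pooling":                  # sequential baseline
            return [convert(p) for p in paths]
        if strategy not in ("pooling", "as_completed"):
            raise ValueError(f"unknown strategy: {strategy}")
        with ProcessPoolExecutor(max_workers=max_workers) as pool:
            if strategy == "pooling":
                # map() preserves input order; results stream back lazily
                return list(pool.map(convert, paths))
            # as_completed() yields futures in finish order, so one slow
            # file no longer blocks progress reporting for the fast ones
            futures = [pool.submit(convert, p) for p in paths]
            return [f.result() for f in as_completed(futures)]

    if __name__ == "__main__":   # guard required for process pools on spawn platforms
        print(run_batch(["a.pdf", "b.pdf"], strategy="as_completed"))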
ui/gradio_ui.py CHANGED
@@ -80,7 +80,7 @@ def build_interface() -> gr.Blocks:
     label="Max Tokens",
     minimum=1,
     maximum=131172, #65536, #32768, #16384, #8192,
-    value=1024, #512,
+    value=8192, #1024, #512,
     step=1,
 )
 temperature_sl = gr.Slider(
@@ -140,11 +140,18 @@ def build_interface() -> gr.Blocks:
     value="markdown",
 )
 with gr.Row():
+    #pooling_cb = gr.Checkbox(
+    pooling_dd = gr.Dropdown(
+        label="Pool: multiprocessing",
+        info="Enable for high # of files [Beware!]",
+        value="no_pooling", #True, #False
+        choices=["no_pooling", "pooling", "as_completed"]
+    )
     max_workers_sl = gr.Slider(
         label="Max Worker",
         minimum=1,
         maximum=4,
-        value=1,
+        value=3,
         step=1
     )
     max_retries_sl = gr.Slider(
@@ -519,7 +526,8 @@ def build_interface() -> gr.Blocks:
     disable_ocr_math_cb,
     page_range_tb,
     weasyprint_dll_directories_tb,
-    tz_hours_num, #state_tz_hours
+    tz_hours_num, #state_tz_hours
+    pooling_dd,
 ]
 
 ## debug
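The dropdown only reaches `convert_batch` because it is appended to the handler's inputs list; Gradio passes component values positionally, so the list order must mirror the function's parameter order (here `pooling_dd` follows `tz_hours_num`, matching `pooling` after `tz_hours` in the signature). A reduced sketch of that wiring; the button name and handler body are illustrative:

    import gradio as gr

    def convert_batch(tz_hours, pooling):
        # Positional order mirrors the inputs list below.
        return f"tz={tz_hours}, pooling={pooling}"

    with gr.Blocks() as demo:
        tz_hours_num = gr.Number(label="TZ hours", value=0)
        pooling_dd = gr.Dropdown(
            label="Pool: multiprocessing",
            choices=["no_pooling", "pooling", "as_completed"],
            value="no_pooling",
        )
        out = gr.Textbox()
        convert_btn = gr.Button("Convert")   # illustrative name
        convert_btn.click(convert_batch, inputs=[tz_hours_num, pooling_dd], outputs=out)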
utils/file_utils.py CHANGED
@@ -223,7 +223,8 @@ def check_create_file(filename: Union[str, Path]) -> Path:
     except PermissionError: ##[Errno 13] Permission denied: '/home/user/app/logs/app_logging_2025-09-18.log'
         warnings.warn("[Errno 13] Permission denied, possibly insufficient permission or Persistent Storage not enable: attempting chmod 0o2644")
         filename_path.touch(exist_ok=True, mode=0o2755) # Creates an empty file if it doesn't exists
-        filename_path.chmod(0)
+        #filename_path.chmod(0)
+        filename_path.chmod(0o2755)
 
     return filename_path
 
@@ -524,7 +525,8 @@ def write_markdown(
     ##SMY: [resolved] Permission Errno13 - https://stackoverflow.com/a/57454275
     md_path.parent.mkdir(mode=0o2755, parents=True, exist_ok=True) ##SMY: create nested md_path if not exists
     #md_path.parent.mkdir(parents=True, exist_ok=True) ##SMY: md_path now resides in Temp
-    md_path.parent.chmod(0)
+    #md_path.parent.chmod(0) ##resets permission (none): d--------- 2
+    md_path.parent.chmod(mode=0o2755)
 
     try:
         #markdown_text = getattr(rendered, "markdown") ##SMY: get extracted markdown
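This is the "write output markdown" half of the commit message: `chmod(0)` had stripped every permission bit from the target (the `d---------` listing noted in the comment), so the markdown write that followed failed with Errno 13. `0o2755` restores rwx for the owner and r-x for group/other, with the setgid bit so files created inside inherit the directory's group. A small POSIX-only sketch of the before/after; the path is illustrative:

    import stat
    from pathlib import Path

    out_dir = Path("/tmp/md_out")                       # illustrative path
    out_dir.mkdir(mode=0o2755, parents=True, exist_ok=True)

    out_dir.chmod(0)                                    # the old behaviour: d---------
    print(oct(stat.S_IMODE(out_dir.stat().st_mode)))    # 0o0 -> writes now raise PermissionError

    out_dir.chmod(0o2755)                               # the fix: rwxr-sr-x + setgid
    print(oct(stat.S_IMODE(out_dir.stat().st_mode)))    # 0o2755
    (out_dir / "doc.md").write_text("# ok\n")           # write succeeds again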