semmyk committed on
Commit
78b3275
·
1 Parent(s): 9d4150d

baseline08_beta0.4.2_07Oct25: [crossplatform] fix serialising JSON string: process_dicts_data

Browse files
Files changed (2) hide show
  1. README.md +7 -0
  2. utils/file_utils.py +18 -3
README.md CHANGED
@@ -78,6 +78,8 @@ requires-python: ">=3.12"
78
  ---
79
 
80
  # parserPDF
 
 
81
 
82
  [![Gradio](https://img.shields.io/badge/Gradio-SDK-amber?logo=gradio)](https://www.gradio.app/)
83
  [![Python](https://img.shields.io/badge/Python->=3.12-blue?logo=python)](https://www.python.org/)
@@ -230,6 +232,11 @@ Test Structure
230
  - tests/test_llm.py: Tests LLM login, provider validation, Hugging Face/OpenAI client initialization, and API interactions.
231
  - tests/test_main_ui.py: Tests main application logic, UI building, batch conversion, file accumulation, and ProcessPoolExecutor integration.
232
 
 
 
 
 
 
233
 
234
  ## License
235
  MIT License. See [LICENSE](LICENSE) for details.
 
78
  ---
79
 
80
  # parserPDF
81
+ [Pipeline for (in-progress) integrated RAG-KG platform]
82
+ __________
83
 
84
  [![Gradio](https://img.shields.io/badge/Gradio-SDK-amber?logo=gradio)](https://www.gradio.app/)
85
  [![Python](https://img.shields.io/badge/Python->=3.12-blue?logo=python)](https://www.python.org/)
 
232
  - tests/test_llm.py: Tests LLM login, provider validation, Hugging Face/OpenAI client initialization, and API interactions.
233
  - tests/test_main_ui.py: Tests main application logic, UI building, batch conversion, file accumulation, and ProcessPoolExecutor integration.
234
 
235
+ ## Roadmap (No concrete timeline for now)
236
+ 1. Q4 2025: Document viewer pre-processing and processed document view post-processing
237
+ 2. Q4 2025: Integration with RAG-KG (knowledge graph)
238
+ 3. MCP server and faster conversion rate
239
+ 4. Integrated research platform for document realist-based analysis
240
 
241
  ## License
242
  MIT License. See [LICENSE](LICENSE) for details.
utils/file_utils.py CHANGED
@@ -358,19 +358,34 @@ def is_file_with_extension(path_obj: Path) -> bool:
358
  def process_dicts_data(data:Union[dict, list[dict]]):
359
  """ Returns formatted JSON string for a single dictionary or a list of dictionaries"""
360
  import json
361
- from pathlib import WindowsPath
362
  #from typing import dict, list
363
 
364
  # Serialise WindowsPath objects to strings using a custom json.JSONEncoder subclass
365
  class PathEncoder(json.JSONEncoder):
366
  def default(self, obj):
367
- if isinstance(obj, WindowsPath):
 
368
  return str(obj)
369
  # Let the base class default method raise the TypeError for other types
370
- return json.JSONEncoder.default(self, obj)
 
371
 
372
  # Convert the list of dicts to a formatted JSON string
373
  formatted_string = json.dumps(data, indent=4, cls=PathEncoder)
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
  return formatted_string
376
 
 
358
  def process_dicts_data(data:Union[dict, list[dict]]):
359
  """ Returns formatted JSON string for a single dictionary or a list of dictionaries"""
360
  import json
361
+ from pathlib import Path #WindowsPath
362
  #from typing import dict, list
363
 
364
  # Serialise pathlib.Path objects (WindowsPath, PosixPath) to strings using a custom json.JSONEncoder subclass
365
  class PathEncoder(json.JSONEncoder):
366
  def default(self, obj):
367
+ #if isinstance(obj, WindowsPath):
368
+ if isinstance(obj, Path):
369
  return str(obj)
370
  # Let the base class default method raise the TypeError for other types
371
+ #return json.JSONEncoder.default(self, obj)
372
+ return super().default(obj) # Use super().default() for better inheritance
373
 
374
  # Convert the list of dicts to a formatted JSON string
375
  formatted_string = json.dumps(data, indent=4, cls=PathEncoder)
376
+
377
+ '''
378
+ def path_to_str(obj):
379
+ """
380
+ A simple function to convert pathlib.Path objects to strings.
381
+ """
382
+ if isinstance(obj, Path):
383
+ return str(obj)
384
+ raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
385
+
386
+ # Convert the list of dicts to a formatted JSON string
387
+ formatted_string = json.dumps(data, indent=4, default=path_to_str)
388
+ '''
389
 
390
  return formatted_string
391