Spaces:
Running
on
Zero
Running
on
Zero
baseline08_beta0.4.2_07Oct25: [crossplatform] fix serialising JSON string: process_dicts_data
Browse files- README.md +7 -0
- utils/file_utils.py +18 -3
README.md
CHANGED
|
@@ -78,6 +78,8 @@ requires-python: ">=3.12"
|
|
| 78 |
---
|
| 79 |
|
| 80 |
# parserPDF
|
|
|
|
|
|
|
| 81 |
|
| 82 |
[](https://www.gradio.app/)
|
| 83 |
[](https://www.python.org/)
|
|
@@ -230,6 +232,11 @@ Test Structure
|
|
| 230 |
- tests/test_llm.py: Tests LLM login, provider validation, Hugging Face/OpenAI client initialization, and API interactions.
|
| 231 |
- tests/test_main_ui.py: Tests main application logic, UI building, batch conversion, file accumulation, and ProcessPoolExecutor integration.
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
## License
|
| 235 |
MIT License. See [LICENSE](LICENSE) for details.
|
|
|
|
| 78 |
---
|
| 79 |
|
| 80 |
# parserPDF
|
| 81 |
+
[Pipeline for (in-progress) integrated RAG-KG platform]
|
| 82 |
+
__________
|
| 83 |
|
| 84 |
[](https://www.gradio.app/)
|
| 85 |
[](https://www.python.org/)
|
|
|
|
| 232 |
- tests/test_llm.py: Tests LLM login, provider validation, Hugging Face/OpenAI client initialization, and API interactions.
|
| 233 |
- tests/test_main_ui.py: Tests main application logic, UI building, batch conversion, file accumulation, and ProcessPoolExecutor integration.
|
| 234 |
|
| 235 |
+
## Roadmap (No concrete timeline for now)
|
| 236 |
+
1. Q4 2025: Document viewer pre-processing and processed document view post-processing
|
| 237 |
+
2. Q4 2025: Integration with RAG-KG (knowledge graph)
|
| 238 |
+
3. MCP server and faster conversion rate
|
| 239 |
+
4. Integrated research platform for realist-based document analysis
|
| 240 |
|
| 241 |
## License
|
| 242 |
MIT License. See [LICENSE](LICENSE) for details.
|
utils/file_utils.py
CHANGED
|
@@ -358,19 +358,34 @@ def is_file_with_extension(path_obj: Path) -> bool:
|
|
| 358 |
def process_dicts_data(data:Union[dict, list[dict]]):
|
| 359 |
""" Returns formatted JSON string for a single dictionary or a list of dictionaries"""
|
| 360 |
import json
|
| 361 |
-
from pathlib import WindowsPath
|
| 362 |
#from typing import dict, list
|
| 363 |
|
| 364 |
# Serialise WindowsPath objects to strings using a custom json.JSONEncoder subclass
|
| 365 |
class PathEncoder(json.JSONEncoder):
|
| 366 |
def default(self, obj):
|
| 367 |
-
if isinstance(obj, WindowsPath):
|
|
|
|
| 368 |
return str(obj)
|
| 369 |
# Let the base class default method raise the TypeError for other types
|
| 370 |
-
return json.JSONEncoder.default(self, obj)
|
|
|
|
| 371 |
|
| 372 |
# Convert the list of dicts to a formatted JSON string
|
| 373 |
formatted_string = json.dumps(data, indent=4, cls=PathEncoder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
return formatted_string
|
| 376 |
|
|
|
|
| 358 |
def process_dicts_data(data:Union[dict, list[dict]]):
|
| 359 |
""" Returns formatted JSON string for a single dictionary or a list of dictionaries"""
|
| 360 |
import json
|
| 361 |
+
from pathlib import Path #WindowsPath
|
| 362 |
#from typing import dict, list
|
| 363 |
|
| 364 |
# Serialise pathlib.Path objects (WindowsPath, PosixPath, ...) to strings using a custom json.JSONEncoder subclass
|
| 365 |
class PathEncoder(json.JSONEncoder):
|
| 366 |
def default(self, obj):
|
| 367 |
+
#if isinstance(obj, WindowsPath):
|
| 368 |
+
if isinstance(obj, Path):
|
| 369 |
return str(obj)
|
| 370 |
# Let the base class default method raise the TypeError for other types
|
| 371 |
+
#return json.JSONEncoder.default(self, obj)
|
| 372 |
+
return super().default(obj) # Use super().default() for better inheritance
|
| 373 |
|
| 374 |
# Convert the list of dicts to a formatted JSON string
|
| 375 |
formatted_string = json.dumps(data, indent=4, cls=PathEncoder)
|
| 376 |
+
|
| 377 |
+
'''
|
| 378 |
+
def path_to_str(obj):
|
| 379 |
+
"""
|
| 380 |
+
A simple function to convert pathlib.Path objects to strings.
|
| 381 |
+
"""
|
| 382 |
+
if isinstance(obj, Path):
|
| 383 |
+
return str(obj)
|
| 384 |
+
raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
|
| 385 |
+
|
| 386 |
+
# Convert the list of dicts to a formatted JSON string
|
| 387 |
+
formatted_string = json.dumps(data, indent=4, default=path_to_str)
|
| 388 |
+
'''
|
| 389 |
|
| 390 |
return formatted_string
|
| 391 |
|