File size: 5,243 Bytes
ec8ecd4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import json
from typing import Any, Dict, List, Optional

import pandas as pd
from tqdm.auto import tqdm

from config import Config
class LegalDataLoader:
    """Load a legal corpus from a JSON file and expose its articles.

    The loader reads either a list of law objects or a single law object.
    From each law it reads ``law_id`` and ``articles``; from each article it
    reads ``article_id``, ``title`` and ``text``. Missing keys fall back to
    an empty string (or an empty list for ``articles``).
    """

    def __init__(self, corpus_path: Optional[str] = None):
        """Create a loader.

        Args:
            corpus_path: Path of the JSON corpus file. When omitted,
                ``Config.CORPUS_PATH`` is looked up at load time.
        """
        self.corpus_path = corpus_path
        self.legal_corpus = None

    def load_legal_corpus(self) -> List[Dict[str, Any]]:
        """Load the legal corpus from the JSON file.

        A top-level JSON object is treated as a single law and wrapped in a
        one-element list so callers always receive a list.

        Returns:
            The list of law dicts, or ``[]`` when the file is missing or is
            not valid JSON. On failure ``self.legal_corpus`` is left
            untouched, so a later call can retry.
        """
        path = self.corpus_path if self.corpus_path is not None else Config.CORPUS_PATH

        try:
            with open(path, "r", encoding="utf-8") as f:
                corpus = json.load(f)
        except FileNotFoundError:
            print(f"Legal corpus file not found at {path}")
            return []
        except json.JSONDecodeError as e:
            # JSONDecodeError is the exception json.load raises on bad input;
            # json.JSONEncoder is a plain class and cannot be caught.
            print(f"Error parsing JSON file: {e}")
            return []

        if isinstance(corpus, list):
            print(f"Loaded {len(corpus)} legal documents")
        else:
            # Single law document format: normalise to a one-element list.
            print(f"Loaded legal document: {corpus.get('law_id', 'Unknown')}")
            corpus = [corpus]

        self.legal_corpus = corpus
        return corpus

    def _get_corpus(self) -> List[Dict[str, Any]]:
        """Return the cached corpus, loading it on first use."""
        if self.legal_corpus is None:
            # Use the return value: it is [] when loading failed, whereas
            # self.legal_corpus would still be None and not iterable.
            return self.load_legal_corpus()
        return self.legal_corpus

    @staticmethod
    def _combined_id(law_id: str, article_id: str) -> str:
        """Build ``"<law_id>_<article_id>"``, or just the article id if either part is empty."""
        return f"{law_id}_{article_id}" if law_id and article_id else article_id

    @classmethod
    def _article_record(cls, law_id: str, article: Dict[str, Any]) -> Dict[str, Any]:
        """Build the lookup/search result dict for one article."""
        article_id = article.get("article_id", "")
        return {
            "law_id": law_id,
            "article_id": article_id,
            "title": article.get("title", ""),
            "text": article.get("text", ""),
            "combined_id": cls._combined_id(law_id, article_id),
        }

    def prepare_documents_for_indexing(self) -> List[Dict[str, Any]]:
        """Flatten the corpus into one document per article for vector indexing.

        Articles whose ``text`` is missing, empty or whitespace-only are
        skipped.

        Returns:
            A list of dicts with keys ``id``, ``title``, ``content`` and
            ``metadata`` (``law_id``, ``article_id``, ``title``, ``source``).
        """
        documents = []
        for law in tqdm(self._get_corpus()):
            law_id = law.get("law_id", "")
            for article in law.get("articles", []):
                content = article.get("text", "")
                if not (content and content.strip()):
                    continue

                article_id = article.get("article_id", "")
                title = article.get("title", "")
                documents.append(
                    {
                        # Unique document ID combining law_id and article_id.
                        "id": self._combined_id(law_id, article_id),
                        "title": title,
                        "content": content,
                        "metadata": {
                            "law_id": law_id,
                            "article_id": article_id,
                            "title": title,
                            "source": "legal_corpus",
                        },
                    }
                )

        print(f"Prepared {len(documents)} documents for indexing")
        return documents

    def get_document_by_id(self, doc_id: str) -> Dict[str, Any]:
        """Return the first article whose ID matches ``doc_id``.

        Args:
            doc_id: Either the combined ``"<law_id>_<article_id>"`` form or a
                bare ``article_id``.

        Returns:
            A dict with ``law_id``, ``article_id``, ``title``, ``text`` and
            ``combined_id``, or ``{}`` when nothing matches.
        """
        for law in self._get_corpus():
            law_id = law.get("law_id", "")
            for article in law.get("articles", []):
                article_id = article.get("article_id", "")
                combined_id = self._combined_id(law_id, article_id)
                if combined_id == doc_id or article_id == doc_id:
                    return self._article_record(law_id, article)
        return {}

    def search_documents_by_keyword(self, keyword: str) -> List[Dict[str, Any]]:
        """Return every article whose text or title contains ``keyword``.

        The match is a case-insensitive substring test.

        Args:
            keyword: Substring to look for.

        Returns:
            A list of dicts with ``law_id``, ``article_id``, ``title``,
            ``text`` and ``combined_id``, in corpus order.
        """
        keyword_lower = keyword.lower()
        results = []
        for law in self._get_corpus():
            law_id = law.get("law_id", "")
            for article in law.get("articles", []):
                # `or ""` guards against an explicit JSON null for the field.
                content = (article.get("text") or "").lower()
                title = (article.get("title") or "").lower()
                if keyword_lower in content or keyword_lower in title:
                    results.append(self._article_record(law_id, article))
        return results
|