GitHub Action committed on
Commit
811fb95
·
1 Parent(s): 146e7cb

Sync from GitHub with Git LFS

Browse files
Files changed (1) hide show
  1. scripts/AI_friendly.py +156 -0
scripts/AI_friendly.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ import yaml
5
+
6
# Root of the tree that is scanned for Markdown files (the current
# working directory).
ROOT_DIR = Path(".")
# Output directory that receives the mirrored files.
STRUCTURED_DIR = ROOT_DIR / "structured_md"
# Index file listing the mirrored files.
INDEX_FILE = STRUCTURED_DIR / "index.md"

# File suffix that identifies a Markdown file.
MD_EXT = ".md"

# JSON-LD templates for the different document types.
# Each value is a str.format template: doubled braces are literal braces and
# the named placeholders ({main_entity}, {title}, {description}, {steps}) are
# filled in by generate_json_ld. The result is a fenced ```json block.
JSON_LD_TEMPLATES = {
    "FAQ": """\n```json
{{
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": {main_entity}
}}
```\n""",
    "HowTo": """\n```json
{{
"@context": "https://schema.org",
"@type": "HowTo",
"name": "{title}",
"description": "{description}",
"step": {steps}
}}
```\n""",
    "Article": """\n```json
{{
"@context": "https://schema.org",
"@type": "Article",
"name": "{title}",
"description": "{description}"
}}
```\n"""
}

# Matches a front-matter block delimited by `---` lines at the very start of
# the text; group 1 is the text between the delimiters. DOTALL lets that text
# span several lines.
FRONT_MATTER_RE = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
41
+
42
def is_md_file(path):
    """Tell whether *path* is a Markdown source that should be mirrored.

    A path qualifies when its lower-cased suffix equals ``MD_EXT`` and
    ``STRUCTURED_DIR`` is not one of its parents, so files that already live
    in the output directory are not picked up again.
    """
    if path.suffix.lower() != MD_EXT:
        return False
    inside_output = STRUCTURED_DIR in path.parents
    return not inside_output
44
+
45
def parse_front_matter(content):
    """Extract the YAML front matter of a Markdown document as a dict.

    Args:
        content: Full text of the document.

    Returns:
        The parsed front-matter mapping, or an empty dict when the document
        has no front-matter block, the block is not valid YAML, or the block
        does not hold a mapping (for example, it is empty or a bare scalar).
    """
    match = FRONT_MATTER_RE.match(content)
    if match is None:
        return {}
    try:
        data = yaml.safe_load(match.group(1))
    except (yaml.YAMLError, ValueError):
        # Best effort: unparsable front matter is treated as absent.
        # ValueError is included because value constructors may reject a
        # syntactically valid scalar (e.g. an out-of-range date).
        return {}
    # safe_load yields None for an empty block and a scalar/list for
    # non-mapping YAML; callers rely on `in` and `.get`, so normalise to dict.
    return data if isinstance(data, dict) else {}
54
+
55
def determine_type(content, front_matter):
    """Pick the document type used to build the JSON-LD block.

    An explicit ``type`` key in *front_matter* is returned unchanged.
    Otherwise the simplest possible detection is used: the first heading
    line (a line starting with ``#``) that contains ``FAQ`` or ``HowTo``
    decides the type, with ``FAQ`` checked first. Everything else is an
    ``"Article"``.
    """
    declared = "type" in front_matter
    if declared:
        return front_matter["type"]

    # Keyword rules are tried in order; the first matching one wins.
    heading_rules = (
        (r"^#.*FAQ", "FAQ"),
        (r"^#.*HowTo", "HowTo"),
    )
    for pattern, label in heading_rules:
        if re.search(pattern, content, re.MULTILINE):
            return label
    return "Article"
64
+
65
def generate_json_ld(content, front_matter, ftype, title, rel_path=None):
    """Build a fenced JSON-LD block describing one Markdown document.

    The JSON is produced with ``json.dumps`` rather than by string
    substitution, so quotes, backslashes and braces in the title,
    description or answers cannot break the output.

    Args:
        content: Full text of the Markdown document.
        front_matter: Parsed front matter. Its ``description`` key is used
            when present; anything that is not a dict is treated as empty.
        ftype: ``"FAQ"`` or ``"HowTo"``; any other value yields an Article.
        title: Value of ``name`` for HowTo and Article documents.
        rel_path: Path of the document relative to the mirror root (any
            object with ``as_posix()``). When given, a ``url`` of the form
            ``structured_md/<rel_path>`` is added. Optional so that
            four-argument callers keep working.

    Returns:
        A string holding a newline, a json code fence with the serialized
        object, and a trailing newline.
    """
    import json

    meta = front_matter if isinstance(front_matter, dict) else {}
    fallback_desc = content[:100].replace("\n", " ") + "..."
    desc = meta.get("description", fallback_desc)

    doc = {"@context": "https://schema.org"}

    if ftype == "FAQ":
        # Only level-2 headings are questions; an answer is the text up to
        # the next level-2 heading (or the end of the document). Slicing
        # between matches keeps duplicate questions and empty answers apart.
        headings = list(
            re.finditer(r"^##(?!#)[ \t]*(.+)$", content, re.MULTILINE)
        )
        boundaries = [h.start() for h in headings[1:]] + [len(content)]
        doc["@type"] = "FAQPage"
        doc["mainEntity"] = [
            {
                "@type": "Question",
                "name": heading.group(1).strip(),
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": content[heading.end():stop].strip(),
                },
            }
            for heading, stop in zip(headings, boundaries)
        ]
    elif ftype == "HowTo":
        doc["@type"] = "HowTo"
        doc["name"] = title
        doc["description"] = desc
        # Every "- item" bullet line becomes one step.
        doc["step"] = [
            {"@type": "HowToStep", "name": item.strip()}
            for item in re.findall(r"^- (.+)$", content, re.MULTILINE)
        ]
    else:  # Article
        doc["@type"] = "Article"
        doc["name"] = title
        doc["description"] = desc

    if rel_path is not None:
        doc["url"] = f"structured_md/{rel_path.as_posix()}"

    body = json.dumps(doc, ensure_ascii=False, indent=2)
    return "\n```json\n" + body + "\n```\n"
96
+
97
def mirror_md_files():
    """Mirror every Markdown file under ROOT_DIR into STRUCTURED_DIR.

    Each mirrored file consists of a generated JSON-LD block followed by the
    unchanged original text. Files already inside STRUCTURED_DIR are skipped
    (see ``is_md_file``).

    Returns:
        The paths of the mirrored files, relative to ROOT_DIR, in sorted
        order.
    """
    # Take a snapshot before writing anything: the loop below creates
    # directories and files under ROOT_DIR, and the tree should not change
    # while it is still being walked. is_file() drops directories whose name
    # happens to end in ".md".
    sources = sorted(
        path
        for path in ROOT_DIR.rglob("*.md")
        if path.is_file() and is_md_file(path)
    )

    mirrored = []
    for md_path in sources:
        rel_path = md_path.relative_to(ROOT_DIR)
        target_path = STRUCTURED_DIR / rel_path
        target_path.parent.mkdir(parents=True, exist_ok=True)

        content = md_path.read_text(encoding="utf-8")

        front_matter = parse_front_matter(content)
        if not isinstance(front_matter, dict):
            # Empty or non-mapping front matter carries no usable keys.
            front_matter = {}
        ftype = determine_type(content, front_matter)
        title = front_matter.get("title", md_path.stem)

        # rel_path is required to build the "url" entry of the JSON-LD block.
        json_ld = generate_json_ld(content, front_matter, ftype, title, rel_path)

        target_path.write_text(json_ld + content, encoding="utf-8")
        mirrored.append(rel_path)
    return mirrored
122
+
123
def generate_index(files):
    """Write INDEX_FILE: a nested Markdown list linking to every mirrored file.

    Directories become plain list items and files become links whose targets
    are relative to the directory that holds the index.

    Args:
        files: Iterable of relative paths (objects exposing ``parts``), such
            as the list returned by ``mirror_md_files``.
    """
    from urllib.parse import quote

    index_lines = ["# ИИ-дружелюбные версии файлов\n"]

    # Build a nested dict: directory name -> sub-dict, file name -> None.
    tree = {}
    for f in files:
        *dirs, leaf = f.parts
        node = tree
        for part in dirs:
            node = node.setdefault(part, {})
        node[leaf] = None

    def render_tree(d, parent_path="", level=0):
        lines = []
        # Two spaces per level: a nested Markdown list item must be indented
        # at least to the content column of its parent ("- " is two wide).
        indent = "  " * level
        for name in sorted(d):
            sub = d[name]
            full_path = Path(parent_path) / name
            if sub is None:
                # Percent-encode the target so spaces, parentheses and
                # non-ASCII characters do not break the link syntax.
                target = quote(full_path.as_posix())
                lines.append(f"{indent}- [{name}]({target})")
            else:
                lines.append(f"{indent}- {name}")
                lines.extend(render_tree(sub, full_path, level + 1))
        return lines

    index_lines.extend(render_tree(tree))

    INDEX_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(INDEX_FILE, "w", encoding="utf-8") as f:
        # End the file with a newline, as text files conventionally do.
        f.write("\n".join(index_lines) + "\n")
151
+
152
if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, mirror the
    # Markdown files, write the index, then report how many files were handled.
    STRUCTURED_DIR.mkdir(exist_ok=True)
    processed = mirror_md_files()
    generate_index(processed)
    summary = f"Обработано {len(processed)} файлов. Индекс создан: {INDEX_FILE}"
    print(summary)