Spaces:
Sleeping
Sleeping
Module 4: Refine Crawler's logic
Browse files
- Use a single session for the whole crawling process instead of opening many separate requests,
for better time usage.
- Use multithreading to crawl documents concurrently.
- Use pdfminer.six for faster PDF crawling (this tool does not
preserve the PDF formatting, but keyword extraction is enough for this use case).
- Add the Playwright Stealth method to avoid being blocked by Cloudflare.
- Baseline.ipynb +522 -283
Baseline.ipynb
CHANGED
|
@@ -1154,7 +1154,7 @@
|
|
| 1154 |
},
|
| 1155 |
{
|
| 1156 |
"cell_type": "code",
|
| 1157 |
-
"execution_count":
|
| 1158 |
"id": "4437641d",
|
| 1159 |
"metadata": {
|
| 1160 |
"colab": {
|
|
@@ -1261,7 +1261,7 @@
|
|
| 1261 |
},
|
| 1262 |
{
|
| 1263 |
"cell_type": "code",
|
| 1264 |
-
"execution_count":
|
| 1265 |
"id": "0jkUpzEPhFLT",
|
| 1266 |
"metadata": {
|
| 1267 |
"colab": {
|
|
@@ -1347,7 +1347,9 @@
|
|
| 1347 |
"cell_type": "code",
|
| 1348 |
"execution_count": null,
|
| 1349 |
"id": "18c52c1a",
|
| 1350 |
-
"metadata": {
|
|
|
|
|
|
|
| 1351 |
"outputs": [],
|
| 1352 |
"source": [
|
| 1353 |
"import os\n",
|
|
@@ -1499,7 +1501,7 @@
|
|
| 1499 |
" domain_bonus += 0.25\n",
|
| 1500 |
" elif any(d in domain for d in BAD_DOMAINS):\n",
|
| 1501 |
" penalty += 0.3\n",
|
| 1502 |
-
"
|
| 1503 |
" # Language bonus (phát hiện tiếng Việt)\n",
|
| 1504 |
" vietnamese_chars = re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', snippet)\n",
|
| 1505 |
" lang_bonus = 0.1 if len(vietnamese_chars) > 5 else -0.1 # trừ nếu snippet không phải tiếng Việt\n",
|
|
@@ -1570,7 +1572,10 @@
|
|
| 1570 |
"cell_type": "code",
|
| 1571 |
"execution_count": null,
|
| 1572 |
"id": "771734e4",
|
| 1573 |
-
"metadata": {
|
|
|
|
|
|
|
|
|
|
| 1574 |
"outputs": [
|
| 1575 |
{
|
| 1576 |
"name": "stdout",
|
|
@@ -1615,34 +1620,62 @@
|
|
| 1615 |
},
|
| 1616 |
{
|
| 1617 |
"cell_type": "code",
|
| 1618 |
-
"
|
| 1619 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1620 |
"metadata": {
|
| 1621 |
"colab": {
|
| 1622 |
"base_uri": "https://localhost:8080/"
|
| 1623 |
},
|
| 1624 |
-
"id": "
|
| 1625 |
-
"outputId": "
|
| 1626 |
},
|
|
|
|
|
|
|
| 1627 |
"outputs": [
|
| 1628 |
{
|
| 1629 |
-
"name": "stdout",
|
| 1630 |
"output_type": "stream",
|
|
|
|
| 1631 |
"text": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1632 |
"Downloading Chromium 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip\u001b[22m\n",
|
| 1633 |
-
"\u001b[1G173.7 MiB [] 0% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 0%
|
| 1634 |
"Chromium 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium-1187\n",
|
| 1635 |
"Downloading Chromium Headless Shell 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-headless-shell-linux.zip\u001b[22m\n",
|
| 1636 |
-
"\u001b[1G104.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 0% 2.
|
| 1637 |
"Chromium Headless Shell 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium_headless_shell-1187\n",
|
| 1638 |
"Downloading Firefox 141.0 (playwright build v1490)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/firefox/1490/firefox-ubuntu-22.04.zip\u001b[22m\n",
|
| 1639 |
-
"\u001b[1G96 MiB [] 0% 0.0s\u001b[0K\u001b[1G96 MiB [] 0%
|
| 1640 |
"Firefox 141.0 (playwright build v1490) downloaded to /root/.cache/ms-playwright/firefox-1490\n",
|
| 1641 |
"Downloading Webkit 26.0 (playwright build v2203)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/webkit/2203/webkit-ubuntu-22.04.zip\u001b[22m\n",
|
| 1642 |
-
"\u001b[1G94.6 MiB [] 0% 0.0s\u001b[0K\u001b[1G94.6 MiB [] 0%
|
| 1643 |
"Webkit 26.0 (playwright build v2203) downloaded to /root/.cache/ms-playwright/webkit-2203\n",
|
| 1644 |
"Downloading FFMPEG playwright build v1011\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/ffmpeg/1011/ffmpeg-linux.zip\u001b[22m\n",
|
| 1645 |
-
"\u001b[1G2.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 48% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 100% 0.0s\u001b[0K\n",
|
| 1646 |
"FFMPEG playwright build v1011 downloaded to /root/.cache/ms-playwright/ffmpeg-1011\n",
|
| 1647 |
"Playwright Host validation warning: \n",
|
| 1648 |
"╔══════════════════════════════════════════════════════╗\n",
|
|
@@ -1659,33 +1692,30 @@
|
|
| 1659 |
"║ libmanette-0.2.so.0 ║\n",
|
| 1660 |
"╚══════════════════════════════════════════════════════╝\n",
|
| 1661 |
" at validateDependenciesLinux (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:269:9)\n",
|
| 1662 |
-
"\u001b[90m at process.processTicksAndRejections (node:internal/process/task_queues:105:5)\u001b[39m\n",
|
| 1663 |
" at async Registry._validateHostRequirements (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:934:14)\n",
|
| 1664 |
" at async Registry._validateHostRequirementsForExecutableIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1056:7)\n",
|
| 1665 |
" at async Registry.validateHostRequirementsForExecutablesIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1045:7)\n",
|
| 1666 |
" at async i.<anonymous> (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/cli/program.js:217:7)\n",
|
| 1667 |
"Installing dependencies...\n",
|
| 1668 |
"Hit:1 https://cli.github.com/packages stable InRelease\n",
|
| 1669 |
-
"
|
| 1670 |
-
"
|
| 1671 |
-
"
|
| 1672 |
-
"Get:5
|
| 1673 |
-
"Get:6
|
| 1674 |
-
"
|
| 1675 |
-
"
|
| 1676 |
-
"
|
| 1677 |
-
"Get:10
|
| 1678 |
-
"
|
| 1679 |
-
"Get:12
|
| 1680 |
-
"
|
| 1681 |
-
"Get:14 https://
|
| 1682 |
-
"Get:15 http://
|
| 1683 |
-
"Get:16
|
| 1684 |
-
"Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,
|
| 1685 |
-
"Get:18
|
| 1686 |
-
"
|
| 1687 |
-
"Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,276 kB]\n",
|
| 1688 |
-
"Fetched 28.4 MB in 5s (6,112 kB/s)\n",
|
| 1689 |
"Reading package lists... Done\n",
|
| 1690 |
"W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
|
| 1691 |
"Reading package lists... Done\n",
|
|
@@ -1844,7 +1874,7 @@
|
|
| 1844 |
" libwildmidi2 libwoff1 libxtst6 libyuv0 libzbar0 libzxingcore1\n",
|
| 1845 |
" session-migration timgm6mb-soundfont xfonts-cyrillic xfonts-encodings\n",
|
| 1846 |
" xfonts-scalable xfonts-utils\n",
|
| 1847 |
-
"0 upgraded, 94 newly installed, 0 to remove and
|
| 1848 |
"Need to get 48.2 MB of archives.\n",
|
| 1849 |
"After this operation, 123 MB of additional disk space will be used.\n",
|
| 1850 |
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-ipafont-gothic all 00303-21ubuntu1 [3,513 kB]\n",
|
|
@@ -1941,7 +1971,7 @@
|
|
| 1941 |
"Get:92 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-aacenc0 amd64 0.1.3-2 [69.4 kB]\n",
|
| 1942 |
"Get:93 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-amrwbenc0 amd64 0.1.3-2 [68.2 kB]\n",
|
| 1943 |
"Get:94 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 gstreamer1.0-plugins-bad amd64 1.20.3-0ubuntu1.1 [2,602 kB]\n",
|
| 1944 |
-
"Fetched 48.2 MB in
|
| 1945 |
"Extracting templates from packages: 100%\n",
|
| 1946 |
"Preconfiguring packages ...\n",
|
| 1947 |
"Selecting previously unselected package fonts-ipafont-gothic.\n",
|
|
@@ -2321,33 +2351,33 @@
|
|
| 2321 |
"Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...\n",
|
| 2322 |
"Processing triggers for libglib2.0-0:amd64 (2.72.4-0ubuntu2.6) ...\n",
|
| 2323 |
"Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
|
| 2324 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2325 |
"\n",
|
| 2326 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2327 |
"\n",
|
| 2328 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2329 |
"\n",
|
| 2330 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2331 |
"\n",
|
| 2332 |
"/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
|
| 2333 |
"\n",
|
| 2334 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2335 |
"\n",
|
| 2336 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
|
| 2337 |
"\n",
|
| 2338 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2339 |
"\n",
|
| 2340 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2341 |
"\n",
|
| 2342 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2343 |
"\n",
|
| 2344 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
|
| 2345 |
"\n",
|
| 2346 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2347 |
"\n",
|
| 2348 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2349 |
"\n",
|
| 2350 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2351 |
"\n",
|
| 2352 |
"Setting up glib-networking:amd64 (2.72.0-1) ...\n",
|
| 2353 |
"Setting up libsoup2.4-1:amd64 (2.74.2-3ubuntu0.6) ...\n",
|
|
@@ -2360,34 +2390,217 @@
|
|
| 2360 |
"Setting up gstreamer1.0-plugins-bad:amd64 (1.20.3-0ubuntu1.1) ...\n",
|
| 2361 |
"Processing triggers for dictionaries-common (1.28.14) ...\n",
|
| 2362 |
"Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
|
| 2363 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2364 |
"\n",
|
| 2365 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2366 |
"\n",
|
| 2367 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2368 |
"\n",
|
| 2369 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2370 |
"\n",
|
| 2371 |
"/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
|
| 2372 |
"\n",
|
| 2373 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2374 |
"\n",
|
| 2375 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
|
| 2376 |
"\n",
|
| 2377 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2378 |
"\n",
|
| 2379 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2380 |
"\n",
|
| 2381 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2382 |
"\n",
|
| 2383 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
|
| 2384 |
"\n",
|
|
|
|
|
|
|
| 2385 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
|
| 2386 |
"\n",
|
| 2387 |
-
"/sbin/ldconfig.real: /usr/local/lib/
|
| 2388 |
"\n",
|
| 2389 |
-
"
|
| 2390 |
-
"\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2391 |
]
|
| 2392 |
}
|
| 2393 |
],
|
|
@@ -2398,7 +2611,7 @@
|
|
| 2398 |
},
|
| 2399 |
{
|
| 2400 |
"cell_type": "code",
|
| 2401 |
-
"execution_count":
|
| 2402 |
"id": "uNgeNVFoMErV",
|
| 2403 |
"metadata": {
|
| 2404 |
"id": "uNgeNVFoMErV"
|
|
@@ -2406,48 +2619,46 @@
|
|
| 2406 |
"outputs": [],
|
| 2407 |
"source": [
|
| 2408 |
"import requests\n",
|
| 2409 |
-
"import
|
| 2410 |
-
"import
|
|
|
|
|
|
|
| 2411 |
]
|
| 2412 |
},
|
| 2413 |
{
|
| 2414 |
"cell_type": "code",
|
| 2415 |
-
"execution_count":
|
| 2416 |
"id": "tvAnpg8zMA08",
|
| 2417 |
"metadata": {
|
| 2418 |
"id": "tvAnpg8zMA08"
|
| 2419 |
},
|
| 2420 |
"outputs": [],
|
| 2421 |
"source": [
|
| 2422 |
-
"async def extract_text_from_pdf(url: str) -> str | None:\n",
|
| 2423 |
" \"\"\"\n",
|
| 2424 |
-
" Tải file PDF từ URL và trích xuất
|
| 2425 |
" \"\"\"\n",
|
| 2426 |
-
" print(f\" -> Detect PDF link. Handle by
|
| 2427 |
" try:\n",
|
| 2428 |
-
"
|
| 2429 |
-
"
|
| 2430 |
-
"
|
|
|
|
| 2431 |
"\n",
|
| 2432 |
-
"
|
| 2433 |
-
" with pdfplumber.open(io.BytesIO(response.content)) as pdf:\n",
|
| 2434 |
-
" full_text = []\n",
|
| 2435 |
-
" for page in pdf.pages:\n",
|
| 2436 |
-
" text = page.extract_text()\n",
|
| 2437 |
-
" if text:\n",
|
| 2438 |
-
" full_text.append(text)\n",
|
| 2439 |
"\n",
|
| 2440 |
-
"
|
| 2441 |
-
"
|
|
|
|
| 2442 |
"\n",
|
| 2443 |
" except Exception as e:\n",
|
| 2444 |
-
" print(f\"Error when open file PDF {url}: {e}\")\n",
|
| 2445 |
" return None"
|
| 2446 |
]
|
| 2447 |
},
|
| 2448 |
{
|
| 2449 |
"cell_type": "code",
|
| 2450 |
-
"execution_count":
|
| 2451 |
"id": "xSKWwAbIBwTu",
|
| 2452 |
"metadata": {
|
| 2453 |
"id": "xSKWwAbIBwTu"
|
|
@@ -2455,71 +2666,97 @@
|
|
| 2455 |
"outputs": [],
|
| 2456 |
"source": [
|
| 2457 |
"from playwright.async_api import async_playwright\n",
|
|
|
|
| 2458 |
"import trafilatura"
|
| 2459 |
]
|
| 2460 |
},
|
| 2461 |
{
|
| 2462 |
"cell_type": "code",
|
| 2463 |
-
"execution_count":
|
| 2464 |
"id": "6QF-79pKSBw1",
|
| 2465 |
"metadata": {
|
| 2466 |
"id": "6QF-79pKSBw1"
|
| 2467 |
},
|
| 2468 |
"outputs": [],
|
| 2469 |
"source": [
|
| 2470 |
-
"async def extract_text_from_web(url: str) -> str | None:\n",
|
| 2471 |
" \"\"\"\n",
|
| 2472 |
-
"
|
| 2473 |
-
"
|
| 2474 |
" \"\"\"\n",
|
|
|
|
| 2475 |
"\n",
|
| 2476 |
-
" print(f\" -> Detect web link. Handle by trafilatura: {url}\")\n",
|
| 2477 |
" try:\n",
|
| 2478 |
-
"
|
| 2479 |
-
"
|
| 2480 |
-
"
|
|
|
|
|
|
|
| 2481 |
"\n",
|
| 2482 |
-
"
|
|
|
|
|
|
|
|
|
|
| 2483 |
"\n",
|
| 2484 |
-
"
|
| 2485 |
-
" await browser.close()\n",
|
| 2486 |
"\n",
|
| 2487 |
-
"
|
| 2488 |
-
"
|
|
|
|
| 2489 |
"\n",
|
| 2490 |
-
"
|
| 2491 |
-
"
|
| 2492 |
-
"
|
|
|
|
| 2493 |
"\n",
|
| 2494 |
-
"
|
| 2495 |
-
"
|
| 2496 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2497 |
]
|
| 2498 |
},
|
| 2499 |
{
|
| 2500 |
"cell_type": "code",
|
| 2501 |
-
"execution_count":
|
| 2502 |
"id": "rL1vDTvHMwAj",
|
| 2503 |
"metadata": {
|
| 2504 |
"id": "rL1vDTvHMwAj"
|
| 2505 |
},
|
| 2506 |
"outputs": [],
|
| 2507 |
"source": [
|
| 2508 |
-
"async def fetch_content_from_url(url: str) -> str | None:\n",
|
| 2509 |
" \"\"\"\n",
|
| 2510 |
" Hàm điều phối: Kiểm tra loại URL và gọi hàm xử lý tương ứng.\n",
|
| 2511 |
" \"\"\"\n",
|
| 2512 |
-
" # Chuyển URL về chữ thường để kiểm tra đuôi file\n",
|
| 2513 |
" if url.lower().endswith('.pdf'):\n",
|
| 2514 |
-
" return await extract_text_from_pdf(url)\n",
|
| 2515 |
" else:\n",
|
| 2516 |
-
"
|
| 2517 |
-
" return await extract_text_from_web(url)"
|
| 2518 |
]
|
| 2519 |
},
|
| 2520 |
{
|
| 2521 |
"cell_type": "code",
|
| 2522 |
-
"execution_count":
|
| 2523 |
"id": "w3y0tq_pLIXu",
|
| 2524 |
"metadata": {
|
| 2525 |
"id": "w3y0tq_pLIXu"
|
|
@@ -2556,13 +2793,16 @@
|
|
| 2556 |
},
|
| 2557 |
{
|
| 2558 |
"cell_type": "code",
|
| 2559 |
-
"execution_count":
|
| 2560 |
"id": "0eJs0RfoBz5o",
|
| 2561 |
"metadata": {
|
| 2562 |
"id": "0eJs0RfoBz5o"
|
| 2563 |
},
|
| 2564 |
"outputs": [],
|
| 2565 |
"source": [
|
|
|
|
|
|
|
|
|
|
| 2566 |
"def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50) -> list[str]:\n",
|
| 2567 |
" \"\"\"Hàm tiện ích để chia văn bản dài thành các chunk nhỏ hơn.\"\"\"\n",
|
| 2568 |
" text_splitter = RecursiveCharacterTextSplitter(\n",
|
|
@@ -2575,207 +2815,206 @@
|
|
| 2575 |
},
|
| 2576 |
{
|
| 2577 |
"cell_type": "code",
|
| 2578 |
-
"execution_count":
|
| 2579 |
-
"id": "
|
| 2580 |
"metadata": {
|
| 2581 |
-
"id": "
|
| 2582 |
},
|
| 2583 |
"outputs": [],
|
| 2584 |
"source": [
|
| 2585 |
-
"
|
| 2586 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2587 |
]
|
| 2588 |
},
|
| 2589 |
{
|
| 2590 |
"cell_type": "code",
|
| 2591 |
-
"
|
| 2592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2593 |
"metadata": {
|
| 2594 |
"colab": {
|
| 2595 |
"base_uri": "https://localhost:8080/"
|
| 2596 |
},
|
| 2597 |
-
"id": "
|
| 2598 |
-
"outputId": "
|
| 2599 |
},
|
|
|
|
|
|
|
| 2600 |
"outputs": [
|
| 2601 |
{
|
| 2602 |
-
"name": "stdout",
|
| 2603 |
"output_type": "stream",
|
|
|
|
| 2604 |
"text": [
|
| 2605 |
"\n",
|
| 2606 |
"==================================================\n",
|
| 2607 |
"Handle claim: 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '\n",
|
| 2608 |
-
"
|
| 2609 |
-
" ->
|
| 2610 |
-
" -> Detect web link. Handle by
|
| 2611 |
-
"
|
| 2612 |
-
"
|
| 2613 |
-
"\n",
|
| 2614 |
-
" ->
|
| 2615 |
-
" -> Detect web link. Handle by
|
| 2616 |
-
"
|
| 2617 |
-
"
|
| 2618 |
-
"\n",
|
| 2619 |
-
" ->
|
| 2620 |
-
" -> Detect web link. Handle by
|
| 2621 |
-
"
|
| 2622 |
-
"
|
| 2623 |
-
"
|
| 2624 |
-
"
|
| 2625 |
-
"
|
| 2626 |
-
"
|
| 2627 |
-
"
|
| 2628 |
-
"\n",
|
| 2629 |
-
"
|
| 2630 |
-
"
|
| 2631 |
-
"SUCCESS!
|
| 2632 |
-
"SUCCESS
|
| 2633 |
-
"
|
| 2634 |
-
"
|
| 2635 |
-
"
|
| 2636 |
-
"SUCCESS!
|
| 2637 |
-
"
|
| 2638 |
-
"
|
| 2639 |
-
"
|
| 2640 |
-
"
|
| 2641 |
-
"SUCCESS!
|
| 2642 |
-
"SUCCESS!!
|
| 2643 |
-
"\n",
|
| 2644 |
-
"
|
| 2645 |
-
"
|
| 2646 |
-
"SUCCESS
|
| 2647 |
-
"SUCCESS!!
|
| 2648 |
-
"\n",
|
| 2649 |
-
"
|
| 2650 |
-
"
|
| 2651 |
-
|
| 2652 |
-
|
| 2653 |
-
|
| 2654 |
-
|
| 2655 |
-
|
| 2656 |
-
|
| 2657 |
-
"
|
| 2658 |
-
|
| 2659 |
-
|
| 2660 |
-
|
| 2661 |
-
|
| 2662 |
-
|
| 2663 |
-
|
| 2664 |
-
"
|
| 2665 |
-
"
|
| 2666 |
-
"SUCCESS
|
| 2667 |
-
"
|
| 2668 |
-
"==> Finish for claim 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '. Total: 149 chunks.\n",
|
| 2669 |
"\n",
|
| 2670 |
"==================================================\n",
|
| 2671 |
"Handle claim: 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'\n",
|
| 2672 |
-
"
|
| 2673 |
-
" ->
|
| 2674 |
-
" -> Detect web link. Handle by
|
| 2675 |
-
"
|
| 2676 |
-
"
|
| 2677 |
-
"\n",
|
| 2678 |
-
" ->
|
| 2679 |
-
" -> Detect
|
| 2680 |
-
"
|
| 2681 |
-
"
|
| 2682 |
-
"\n",
|
| 2683 |
-
"
|
| 2684 |
-
"
|
| 2685 |
-
"SUCCESS!
|
| 2686 |
-
"
|
| 2687 |
-
"
|
| 2688 |
-
"
|
| 2689 |
-
"
|
| 2690 |
-
"SUCCESS!
|
| 2691 |
-
"SUCCESS
|
| 2692 |
-
"
|
| 2693 |
-
"
|
| 2694 |
-
" -> Detect web link. Handle by trafilatura: https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
|
| 2695 |
-
"SUCCESS! Context extract from Playwright ---\n",
|
| 2696 |
-
"SUCCESS!! Using full content\n",
|
| 2697 |
-
"\n",
|
| 2698 |
-
" -> Crawling: https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
|
| 2699 |
-
" -> Detect PDF link. Handle by pdfplumber: https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
|
| 2700 |
-
"SUCCESS! Context extract from Playwright ---\n",
|
| 2701 |
-
"SUCCESS!! Using full content\n",
|
| 2702 |
-
"\n",
|
| 2703 |
-
" -> Crawling: https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
|
| 2704 |
-
" -> Detect PDF link. Handle by pdfplumber: https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
|
| 2705 |
-
"SUCCESS! Context extract from Playwright ---\n",
|
| 2706 |
-
"SUCCESS!! Using full content\n",
|
| 2707 |
-
"\n",
|
| 2708 |
-
" -> Crawling: http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
|
| 2709 |
-
" -> Detect web link. Handle by trafilatura: http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
|
| 2710 |
-
"Error for using trafilatura for URL http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html: Page.goto: Timeout 90000ms exceeded.\n",
|
| 2711 |
"Call log:\n",
|
| 2712 |
-
" - navigating to \"
|
| 2713 |
-
"\n",
|
| 2714 |
-
"FAIL!! Using snippet as backup\n",
|
| 2715 |
-
"\n",
|
| 2716 |
-
" -> Crawling: https://loigiaihay.com/bai-tap-245461.html\n",
|
| 2717 |
-
" -> Detect web link. Handle by trafilatura: https://loigiaihay.com/bai-tap-245461.html\n",
|
| 2718 |
-
"SUCCESS! Context extract from Playwright ---\n",
|
| 2719 |
-
"SUCCESS!! Using full content\n",
|
| 2720 |
"\n",
|
| 2721 |
-
"
|
| 2722 |
-
"
|
| 2723 |
-
"Error
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2724 |
"Call log:\n",
|
| 2725 |
-
" - navigating to \"
|
| 2726 |
"\n",
|
| 2727 |
-
"
|
| 2728 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2729 |
]
|
| 2730 |
}
|
| 2731 |
-
],
|
| 2732 |
-
"source": [
|
| 2733 |
-
"# Tải dữ liệu từ file\n",
|
| 2734 |
-
"with open('document_retrieval_results.json', 'r', encoding='utf-8') as f:\n",
|
| 2735 |
-
" retrieved_data = json.load(f)\n",
|
| 2736 |
-
"\n",
|
| 2737 |
-
"# Tổ chức lại evidence theo từng claim\n",
|
| 2738 |
-
"evidence_by_claim = {}\n",
|
| 2739 |
-
"claims = list(retrieved_data.keys())\n",
|
| 2740 |
-
"\n",
|
| 2741 |
-
"for claim in claims:\n",
|
| 2742 |
-
" print(f\"\\n{'='*50}\\nHandle claim: '{claim}'\")\n",
|
| 2743 |
-
" documents = retrieved_data[claim]\n",
|
| 2744 |
-
" all_chunks_for_this_claim = []\n",
|
| 2745 |
-
"\n",
|
| 2746 |
-
" for doc in documents:\n",
|
| 2747 |
-
" # 1. Thử crawl để lấy nội dung đầy đủ\n",
|
| 2748 |
-
" print(f\"\\n -> Crawling: {doc['link']}\")\n",
|
| 2749 |
-
" full_content = await fetch_content_from_url(doc['link'])\n",
|
| 2750 |
-
"\n",
|
| 2751 |
-
" # 2. Chiến lược Fallback: Nếu crawl lỗi, dùng tạm snippet\n",
|
| 2752 |
-
" content_to_process = \"\"\n",
|
| 2753 |
-
" if full_content and len(full_content) > 100: # Kiểm tra nội dung có đáng kể không\n",
|
| 2754 |
-
" print(\"SUCCESS!! Using full content\")\n",
|
| 2755 |
-
" cleaned_full_content = clean_text(full_content)\n",
|
| 2756 |
-
" content_to_process = f\"{doc.get('title', '')}. {cleaned_full_content}\"\n",
|
| 2757 |
-
" else:\n",
|
| 2758 |
-
" print(\"FAIL!! Using snippet as backup\")\n",
|
| 2759 |
-
" cleaned_snippet = clean_text(doc.get('snippet', ''))\n",
|
| 2760 |
-
" content_to_process = f\"{doc.get('title', '')}. {cleaned_snippet}\"\n",
|
| 2761 |
-
"\n",
|
| 2762 |
-
" # 3. Bắt buộc: Chia nhỏ nội dung thành các chunks\n",
|
| 2763 |
-
" chunks = chunk_text(content_to_process)\n",
|
| 2764 |
-
"\n",
|
| 2765 |
-
" # 4. Lưu các chunks với cấu trúc dữ liệu nhất quán\n",
|
| 2766 |
-
" for chunk_text_part in chunks:\n",
|
| 2767 |
-
" all_chunks_for_this_claim.append({\n",
|
| 2768 |
-
" \"text\": chunk_text_part,\n",
|
| 2769 |
-
" \"link\": doc['link'] # Giữ lại nguồn của chunk\n",
|
| 2770 |
-
" })\n",
|
| 2771 |
-
"\n",
|
| 2772 |
-
" evidence_by_claim[claim] = all_chunks_for_this_claim\n",
|
| 2773 |
-
" print(f\"==> Finish for claim '{claim}'. Total: {len(all_chunks_for_this_claim)} chunks.\")"
|
| 2774 |
]
|
| 2775 |
},
|
| 2776 |
{
|
| 2777 |
"cell_type": "code",
|
| 2778 |
-
"execution_count":
|
| 2779 |
"id": "-CcEWKwlpLII",
|
| 2780 |
"metadata": {
|
| 2781 |
"id": "-CcEWKwlpLII"
|
|
@@ -2787,7 +3026,7 @@
|
|
| 2787 |
},
|
| 2788 |
{
|
| 2789 |
"cell_type": "code",
|
| 2790 |
-
"execution_count":
|
| 2791 |
"id": "F2Wl6CytHxXu",
|
| 2792 |
"metadata": {
|
| 2793 |
"colab": {
|
|
@@ -3203,7 +3442,7 @@
|
|
| 3203 |
},
|
| 3204 |
{
|
| 3205 |
"cell_type": "code",
|
| 3206 |
-
"execution_count":
|
| 3207 |
"id": "9J1Z1TzdOBfX",
|
| 3208 |
"metadata": {
|
| 3209 |
"id": "9J1Z1TzdOBfX"
|
|
@@ -3215,7 +3454,7 @@
|
|
| 3215 |
},
|
| 3216 |
{
|
| 3217 |
"cell_type": "code",
|
| 3218 |
-
"execution_count":
|
| 3219 |
"id": "qYo7yMI9H1Uc",
|
| 3220 |
"metadata": {
|
| 3221 |
"colab": {
|
|
@@ -3490,7 +3729,7 @@
|
|
| 3490 |
},
|
| 3491 |
{
|
| 3492 |
"cell_type": "code",
|
| 3493 |
-
"execution_count":
|
| 3494 |
"id": "AHMdGO0JOECE",
|
| 3495 |
"metadata": {
|
| 3496 |
"id": "AHMdGO0JOECE"
|
|
@@ -3503,7 +3742,7 @@
|
|
| 3503 |
},
|
| 3504 |
{
|
| 3505 |
"cell_type": "code",
|
| 3506 |
-
"execution_count":
|
| 3507 |
"id": "B3CSkIO6FqEz",
|
| 3508 |
"metadata": {
|
| 3509 |
"colab": {
|
|
@@ -3701,7 +3940,7 @@
|
|
| 3701 |
},
|
| 3702 |
{
|
| 3703 |
"cell_type": "code",
|
| 3704 |
-
"execution_count":
|
| 3705 |
"id": "kBYXeNpdIRdt",
|
| 3706 |
"metadata": {
|
| 3707 |
"colab": {
|
|
@@ -12014,4 +12253,4 @@
|
|
| 12014 |
},
|
| 12015 |
"nbformat": 4,
|
| 12016 |
"nbformat_minor": 5
|
| 12017 |
-
}
|
|
|
|
| 1154 |
},
|
| 1155 |
{
|
| 1156 |
"cell_type": "code",
|
| 1157 |
+
"execution_count": null,
|
| 1158 |
"id": "4437641d",
|
| 1159 |
"metadata": {
|
| 1160 |
"colab": {
|
|
|
|
| 1261 |
},
|
| 1262 |
{
|
| 1263 |
"cell_type": "code",
|
| 1264 |
+
"execution_count": null,
|
| 1265 |
"id": "0jkUpzEPhFLT",
|
| 1266 |
"metadata": {
|
| 1267 |
"colab": {
|
|
|
|
| 1347 |
"cell_type": "code",
|
| 1348 |
"execution_count": null,
|
| 1349 |
"id": "18c52c1a",
|
| 1350 |
+
"metadata": {
|
| 1351 |
+
"id": "18c52c1a"
|
| 1352 |
+
},
|
| 1353 |
"outputs": [],
|
| 1354 |
"source": [
|
| 1355 |
"import os\n",
|
|
|
|
| 1501 |
" domain_bonus += 0.25\n",
|
| 1502 |
" elif any(d in domain for d in BAD_DOMAINS):\n",
|
| 1503 |
" penalty += 0.3\n",
|
| 1504 |
+
"\n",
|
| 1505 |
" # Language bonus (phát hiện tiếng Việt)\n",
|
| 1506 |
" vietnamese_chars = re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', snippet)\n",
|
| 1507 |
" lang_bonus = 0.1 if len(vietnamese_chars) > 5 else -0.1 # trừ nếu snippet không phải tiếng Việt\n",
|
|
|
|
| 1572 |
"cell_type": "code",
|
| 1573 |
"execution_count": null,
|
| 1574 |
"id": "771734e4",
|
| 1575 |
+
"metadata": {
|
| 1576 |
+
"id": "771734e4",
|
| 1577 |
+
"outputId": "92cc3e56-0a5a-4626-c90f-ed39ac6c5a66"
|
| 1578 |
+
},
|
| 1579 |
"outputs": [
|
| 1580 |
{
|
| 1581 |
"name": "stdout",
|
|
|
|
| 1620 |
},
|
| 1621 |
{
|
| 1622 |
"cell_type": "code",
|
| 1623 |
+
"source": [
|
| 1624 |
+
"!pip install playwright\n",
|
| 1625 |
+
"!pip install playwright-stealth\n",
|
| 1626 |
+
"!playwright install\n",
|
| 1627 |
+
"!playwright install-deps\n",
|
| 1628 |
+
"!pip install pdfplumber\n",
|
| 1629 |
+
"!pip install trafilatura"
|
| 1630 |
+
],
|
| 1631 |
"metadata": {
|
| 1632 |
"colab": {
|
| 1633 |
"base_uri": "https://localhost:8080/"
|
| 1634 |
},
|
| 1635 |
+
"id": "IAxlvGzQULAZ",
|
| 1636 |
+
"outputId": "07b66062-0c78-4130-c75e-4a3132ff705d"
|
| 1637 |
},
|
| 1638 |
+
"id": "IAxlvGzQULAZ",
|
| 1639 |
+
"execution_count": 2,
|
| 1640 |
"outputs": [
|
| 1641 |
{
|
|
|
|
| 1642 |
"output_type": "stream",
|
| 1643 |
+
"name": "stdout",
|
| 1644 |
"text": [
|
| 1645 |
+
"Collecting playwright\n",
|
| 1646 |
+
" Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)\n",
|
| 1647 |
+
"Collecting pyee<14,>=13 (from playwright)\n",
|
| 1648 |
+
" Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)\n",
|
| 1649 |
+
"Requirement already satisfied: greenlet<4.0.0,>=3.1.1 in /usr/local/lib/python3.12/dist-packages (from playwright) (3.2.4)\n",
|
| 1650 |
+
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/dist-packages (from pyee<14,>=13->playwright) (4.15.0)\n",
|
| 1651 |
+
"Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)\n",
|
| 1652 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.9/45.9 MB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 1653 |
+
"\u001b[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)\n",
|
| 1654 |
+
"Installing collected packages: pyee, playwright\n",
|
| 1655 |
+
"Successfully installed playwright-1.55.0 pyee-13.0.0\n",
|
| 1656 |
+
"Collecting playwright-stealth\n",
|
| 1657 |
+
" Downloading playwright_stealth-2.0.0-py3-none-any.whl.metadata (4.0 kB)\n",
|
| 1658 |
+
"Requirement already satisfied: playwright<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from playwright-stealth) (1.55.0)\n",
|
| 1659 |
+
"Requirement already satisfied: pyee<14,>=13 in /usr/local/lib/python3.12/dist-packages (from playwright<2.0.0,>=1.0.0->playwright-stealth) (13.0.0)\n",
|
| 1660 |
+
"Requirement already satisfied: greenlet<4.0.0,>=3.1.1 in /usr/local/lib/python3.12/dist-packages (from playwright<2.0.0,>=1.0.0->playwright-stealth) (3.2.4)\n",
|
| 1661 |
+
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/dist-packages (from pyee<14,>=13->playwright<2.0.0,>=1.0.0->playwright-stealth) (4.15.0)\n",
|
| 1662 |
+
"Downloading playwright_stealth-2.0.0-py3-none-any.whl (32 kB)\n",
|
| 1663 |
+
"Installing collected packages: playwright-stealth\n",
|
| 1664 |
+
"Successfully installed playwright-stealth-2.0.0\n",
|
| 1665 |
"Downloading Chromium 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip\u001b[22m\n",
|
| 1666 |
+
"\u001b[1G173.7 MiB [] 0% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 0% 9.7s\u001b[0K\u001b[1G173.7 MiB [] 0% 5.8s\u001b[0K\u001b[1G173.7 MiB [] 0% 5.6s\u001b[0K\u001b[1G173.7 MiB [] 1% 6.2s\u001b[0K\u001b[1G173.7 MiB [] 1% 4.2s\u001b[0K\u001b[1G173.7 MiB [] 3% 3.1s\u001b[0K\u001b[1G173.7 MiB [] 4% 2.5s\u001b[0K\u001b[1G173.7 MiB [] 5% 2.4s\u001b[0K\u001b[1G173.7 MiB [] 6% 2.3s\u001b[0K\u001b[1G173.7 MiB [] 7% 2.1s\u001b[0K\u001b[1G173.7 MiB [] 8% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 9% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 10% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 11% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 12% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 14% 1.7s\u001b[0K\u001b[1G173.7 MiB [] 15% 1.6s\u001b[0K\u001b[1G173.7 MiB [] 17% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 18% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 19% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 21% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 22% 1.3s\u001b[0K\u001b[1G173.7 MiB [] 22% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 23% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 25% 1.3s\u001b[0K\u001b[1G173.7 MiB [] 26% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 29% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 30% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 31% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 33% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 34% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 35% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 36% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 37% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 39% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 40% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 42% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 44% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 46% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 48% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 49% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 51% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 52% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 54% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 55% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 56% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 57% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 58% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 59% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 60% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 
61% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 62% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 63% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 65% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 66% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 67% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 69% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 71% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 73% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 74% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 76% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 78% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 80% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 81% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 83% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 85% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 86% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 87% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 89% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 90% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 92% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 94% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 95% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 97% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 99% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 100% 0.0s\u001b[0K\n",
|
| 1667 |
"Chromium 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium-1187\n",
|
| 1668 |
"Downloading Chromium Headless Shell 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-headless-shell-linux.zip\u001b[22m\n",
|
| 1669 |
+
"\u001b[1G104.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 0% 5.7s\u001b[0K\u001b[1G104.3 MiB [] 1% 2.1s\u001b[0K\u001b[1G104.3 MiB [] 2% 1.7s\u001b[0K\u001b[1G104.3 MiB [] 3% 1.6s\u001b[0K\u001b[1G104.3 MiB [] 5% 1.3s\u001b[0K\u001b[1G104.3 MiB [] 7% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 9% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 11% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 12% 1.1s\u001b[0K\u001b[1G104.3 MiB [] 15% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 16% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 18% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 19% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 22% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 24% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 26% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 27% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 28% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 29% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 31% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 32% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 34% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 36% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 37% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 40% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 42% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 44% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 46% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 47% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 48% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 49% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 50% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 51% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 52% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 53% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 54% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 56% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 58% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 60% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 63% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 65% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 67% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 69% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 71% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 73% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 74% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 77% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 79% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 81% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 84% 0.2s\u001b[0K\u001b[1G104.3 
MiB [] 86% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 88% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 90% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 91% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 93% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 95% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 98% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 100% 0.0s\u001b[0K\n",
|
| 1670 |
"Chromium Headless Shell 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium_headless_shell-1187\n",
|
| 1671 |
"Downloading Firefox 141.0 (playwright build v1490)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/firefox/1490/firefox-ubuntu-22.04.zip\u001b[22m\n",
|
| 1672 |
+
"\u001b[1G96 MiB [] 0% 0.0s\u001b[0K\u001b[1G96 MiB [] 0% 5.0s\u001b[0K\u001b[1G96 MiB [] 1% 2.4s\u001b[0K\u001b[1G96 MiB [] 2% 1.7s\u001b[0K\u001b[1G96 MiB [] 4% 1.3s\u001b[0K\u001b[1G96 MiB [] 6% 1.2s\u001b[0K\u001b[1G96 MiB [] 7% 1.1s\u001b[0K\u001b[1G96 MiB [] 9% 1.1s\u001b[0K\u001b[1G96 MiB [] 10% 1.2s\u001b[0K\u001b[1G96 MiB [] 11% 1.1s\u001b[0K\u001b[1G96 MiB [] 12% 1.1s\u001b[0K\u001b[1G96 MiB [] 14% 1.1s\u001b[0K\u001b[1G96 MiB [] 16% 1.0s\u001b[0K\u001b[1G96 MiB [] 18% 1.0s\u001b[0K\u001b[1G96 MiB [] 19% 0.9s\u001b[0K\u001b[1G96 MiB [] 21% 0.9s\u001b[0K\u001b[1G96 MiB [] 23% 0.9s\u001b[0K\u001b[1G96 MiB [] 25% 0.8s\u001b[0K\u001b[1G96 MiB [] 28% 0.7s\u001b[0K\u001b[1G96 MiB [] 30% 0.7s\u001b[0K\u001b[1G96 MiB [] 33% 0.7s\u001b[0K\u001b[1G96 MiB [] 35% 0.6s\u001b[0K\u001b[1G96 MiB [] 36% 0.6s\u001b[0K\u001b[1G96 MiB [] 36% 0.7s\u001b[0K\u001b[1G96 MiB [] 38% 0.7s\u001b[0K\u001b[1G96 MiB [] 39% 0.6s\u001b[0K\u001b[1G96 MiB [] 40% 0.6s\u001b[0K\u001b[1G96 MiB [] 43% 0.6s\u001b[0K\u001b[1G96 MiB [] 46% 0.5s\u001b[0K\u001b[1G96 MiB [] 47% 0.5s\u001b[0K\u001b[1G96 MiB [] 48% 0.6s\u001b[0K\u001b[1G96 MiB [] 50% 0.5s\u001b[0K\u001b[1G96 MiB [] 50% 0.6s\u001b[0K\u001b[1G96 MiB [] 52% 0.5s\u001b[0K\u001b[1G96 MiB [] 53% 0.5s\u001b[0K\u001b[1G96 MiB [] 55% 0.5s\u001b[0K\u001b[1G96 MiB [] 58% 0.5s\u001b[0K\u001b[1G96 MiB [] 61% 0.4s\u001b[0K\u001b[1G96 MiB [] 63% 0.4s\u001b[0K\u001b[1G96 MiB [] 66% 0.3s\u001b[0K\u001b[1G96 MiB [] 69% 0.3s\u001b[0K\u001b[1G96 MiB [] 71% 0.3s\u001b[0K\u001b[1G96 MiB [] 74% 0.3s\u001b[0K\u001b[1G96 MiB [] 77% 0.2s\u001b[0K\u001b[1G96 MiB [] 79% 0.2s\u001b[0K\u001b[1G96 MiB [] 82% 0.2s\u001b[0K\u001b[1G96 MiB [] 86% 0.1s\u001b[0K\u001b[1G96 MiB [] 87% 0.2s\u001b[0K\u001b[1G96 MiB [] 88% 0.2s\u001b[0K\u001b[1G96 MiB [] 89% 0.2s\u001b[0K\u001b[1G96 MiB [] 89% 0.3s\u001b[0K\u001b[1G96 MiB [] 90% 0.3s\u001b[0K\u001b[1G96 MiB [] 91% 0.3s\u001b[0K\u001b[1G96 MiB [] 95% 0.1s\u001b[0K\u001b[1G96 MiB [] 96% 0.1s\u001b[0K\u001b[1G96 MiB [] 97% 
0.1s\u001b[0K\u001b[1G96 MiB [] 98% 0.0s\u001b[0K\u001b[1G96 MiB [] 99% 0.0s\u001b[0K\u001b[1G96 MiB [] 100% 0.0s\u001b[0K\n",
|
| 1673 |
"Firefox 141.0 (playwright build v1490) downloaded to /root/.cache/ms-playwright/firefox-1490\n",
|
| 1674 |
"Downloading Webkit 26.0 (playwright build v2203)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/webkit/2203/webkit-ubuntu-22.04.zip\u001b[22m\n",
|
| 1675 |
+
"\u001b[1G94.6 MiB [] 0% 0.0s\u001b[0K\u001b[1G94.6 MiB [] 0% 23.5s\u001b[0K\u001b[1G94.6 MiB [] 0% 22.1s\u001b[0K\u001b[1G94.6 MiB [] 0% 13.5s\u001b[0K\u001b[1G94.6 MiB [] 0% 12.0s\u001b[0K\u001b[1G94.6 MiB [] 0% 11.6s\u001b[0K\u001b[1G94.6 MiB [] 1% 10.0s\u001b[0K\u001b[1G94.6 MiB [] 1% 9.0s\u001b[0K\u001b[1G94.6 MiB [] 1% 8.4s\u001b[0K\u001b[1G94.6 MiB [] 2% 7.9s\u001b[0K\u001b[1G94.6 MiB [] 2% 8.0s\u001b[0K\u001b[1G94.6 MiB [] 2% 7.6s\u001b[0K\u001b[1G94.6 MiB [] 2% 7.1s\u001b[0K\u001b[1G94.6 MiB [] 3% 6.9s\u001b[0K\u001b[1G94.6 MiB [] 3% 6.5s\u001b[0K\u001b[1G94.6 MiB [] 3% 6.6s\u001b[0K\u001b[1G94.6 MiB [] 4% 6.7s\u001b[0K\u001b[1G94.6 MiB [] 5% 5.7s\u001b[0K\u001b[1G94.6 MiB [] 5% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 6% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 6% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 7% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 7% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 8% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 8% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 8% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 9% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 9% 5.4s\u001b[0K\u001b[1G94.6 MiB [] 10% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 10% 5.8s\u001b[0K\u001b[1G94.6 MiB [] 10% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 11% 5.9s\u001b[0K\u001b[1G94.6 MiB [] 11% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 12% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 13% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 13% 5.9s\u001b[0K\u001b[1G94.6 MiB [] 14% 5.9s\u001b[0K\u001b[1G94.6 MiB [] 15% 5.8s\u001b[0K\u001b[1G94.6 MiB [] 15% 5.7s\u001b[0K\u001b[1G94.6 MiB [] 16% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 17% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 17% 5.7s\u001b[0K\u001b[1G94.6 MiB [] 17% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 18% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 18% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 19% 5.4s\u001b[0K\u001b[1G94.6 MiB [] 19% 5.3s\u001b[0K\u001b[1G94.6 MiB [] 20% 5.3s\u001b[0K\u001b[1G94.6 MiB [] 20% 5.4s\u001b[0K\u001b[1G94.6 MiB [] 21% 5.2s\u001b[0K\u001b[1G94.6 MiB [] 21% 5.1s\u001b[0K\u001b[1G94.6 MiB [] 22% 5.0s\u001b[0K\u001b[1G94.6 MiB [] 23% 
4.8s\u001b[0K\u001b[1G94.6 MiB [] 23% 4.7s\u001b[0K\u001b[1G94.6 MiB [] 24% 4.6s\u001b[0K\u001b[1G94.6 MiB [] 25% 4.5s\u001b[0K\u001b[1G94.6 MiB [] 26% 4.4s\u001b[0K\u001b[1G94.6 MiB [] 26% 4.3s\u001b[0K\u001b[1G94.6 MiB [] 27% 4.3s\u001b[0K\u001b[1G94.6 MiB [] 28% 4.2s\u001b[0K\u001b[1G94.6 MiB [] 29% 4.1s\u001b[0K\u001b[1G94.6 MiB [] 29% 4.0s\u001b[0K\u001b[1G94.6 MiB [] 30% 4.0s\u001b[0K\u001b[1G94.6 MiB [] 31% 3.9s\u001b[0K\u001b[1G94.6 MiB [] 32% 3.9s\u001b[0K\u001b[1G94.6 MiB [] 33% 3.9s\u001b[0K\u001b[1G94.6 MiB [] 33% 3.8s\u001b[0K\u001b[1G94.6 MiB [] 34% 3.8s\u001b[0K\u001b[1G94.6 MiB [] 35% 3.7s\u001b[0K\u001b[1G94.6 MiB [] 36% 3.6s\u001b[0K\u001b[1G94.6 MiB [] 36% 3.5s\u001b[0K\u001b[1G94.6 MiB [] 37% 3.5s\u001b[0K\u001b[1G94.6 MiB [] 38% 3.4s\u001b[0K\u001b[1G94.6 MiB [] 39% 3.4s\u001b[0K\u001b[1G94.6 MiB [] 39% 3.3s\u001b[0K\u001b[1G94.6 MiB [] 40% 3.3s\u001b[0K\u001b[1G94.6 MiB [] 40% 3.2s\u001b[0K\u001b[1G94.6 MiB [] 41% 3.2s\u001b[0K\u001b[1G94.6 MiB [] 42% 3.2s\u001b[0K\u001b[1G94.6 MiB [] 42% 3.1s\u001b[0K\u001b[1G94.6 MiB [] 43% 3.0s\u001b[0K\u001b[1G94.6 MiB [] 44% 2.9s\u001b[0K\u001b[1G94.6 MiB [] 45% 2.8s\u001b[0K\u001b[1G94.6 MiB [] 46% 2.8s\u001b[0K\u001b[1G94.6 MiB [] 47% 2.7s\u001b[0K\u001b[1G94.6 MiB [] 48% 2.7s\u001b[0K\u001b[1G94.6 MiB [] 48% 2.6s\u001b[0K\u001b[1G94.6 MiB [] 49% 2.6s\u001b[0K\u001b[1G94.6 MiB [] 50% 2.5s\u001b[0K\u001b[1G94.6 MiB [] 51% 2.5s\u001b[0K\u001b[1G94.6 MiB [] 52% 2.4s\u001b[0K\u001b[1G94.6 MiB [] 53% 2.3s\u001b[0K\u001b[1G94.6 MiB [] 54% 2.2s\u001b[0K\u001b[1G94.6 MiB [] 55% 2.2s\u001b[0K\u001b[1G94.6 MiB [] 56% 2.1s\u001b[0K\u001b[1G94.6 MiB [] 57% 2.1s\u001b[0K\u001b[1G94.6 MiB [] 58% 2.0s\u001b[0K\u001b[1G94.6 MiB [] 59% 1.9s\u001b[0K\u001b[1G94.6 MiB [] 60% 1.8s\u001b[0K\u001b[1G94.6 MiB [] 61% 1.8s\u001b[0K\u001b[1G94.6 MiB [] 62% 1.7s\u001b[0K\u001b[1G94.6 MiB [] 63% 1.7s\u001b[0K\u001b[1G94.6 MiB [] 64% 1.6s\u001b[0K\u001b[1G94.6 MiB [] 65% 1.5s\u001b[0K\u001b[1G94.6 MiB [] 67% 
1.5s\u001b[0K\u001b[1G94.6 MiB [] 67% 1.4s\u001b[0K\u001b[1G94.6 MiB [] 68% 1.4s\u001b[0K\u001b[1G94.6 MiB [] 69% 1.4s\u001b[0K\u001b[1G94.6 MiB [] 69% 1.3s\u001b[0K\u001b[1G94.6 MiB [] 70% 1.3s\u001b[0K\u001b[1G94.6 MiB [] 71% 1.2s\u001b[0K\u001b[1G94.6 MiB [] 72% 1.2s\u001b[0K\u001b[1G94.6 MiB [] 73% 1.1s\u001b[0K\u001b[1G94.6 MiB [] 74% 1.1s\u001b[0K\u001b[1G94.6 MiB [] 75% 1.0s\u001b[0K\u001b[1G94.6 MiB [] 76% 1.0s\u001b[0K\u001b[1G94.6 MiB [] 77% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 78% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 79% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 80% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 81% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 82% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 83% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 84% 0.6s\u001b[0K\u001b[1G94.6 MiB [] 86% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 87% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 88% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 89% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 90% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 91% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 92% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 94% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 95% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 97% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 98% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 99% 0.0s\u001b[0K\u001b[1G94.6 MiB [] 100% 0.0s\u001b[0K\n",
|
| 1676 |
"Webkit 26.0 (playwright build v2203) downloaded to /root/.cache/ms-playwright/webkit-2203\n",
|
| 1677 |
"Downloading FFMPEG playwright build v1011\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/ffmpeg/1011/ffmpeg-linux.zip\u001b[22m\n",
|
| 1678 |
+
"\u001b[1G2.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 11% 0.1s\u001b[0K\u001b[1G2.3 MiB [] 48% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 90% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 100% 0.0s\u001b[0K\n",
|
| 1679 |
"FFMPEG playwright build v1011 downloaded to /root/.cache/ms-playwright/ffmpeg-1011\n",
|
| 1680 |
"Playwright Host validation warning: \n",
|
| 1681 |
"╔══════════════════════════════════════════════════════╗\n",
|
|
|
|
| 1692 |
"║ libmanette-0.2.so.0 ║\n",
|
| 1693 |
"╚══════════════════════════════════════════════════════╝\n",
|
| 1694 |
" at validateDependenciesLinux (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:269:9)\n",
|
|
|
|
| 1695 |
" at async Registry._validateHostRequirements (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:934:14)\n",
|
| 1696 |
" at async Registry._validateHostRequirementsForExecutableIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1056:7)\n",
|
| 1697 |
" at async Registry.validateHostRequirementsForExecutablesIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1045:7)\n",
|
| 1698 |
" at async i.<anonymous> (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/cli/program.js:217:7)\n",
|
| 1699 |
"Installing dependencies...\n",
|
| 1700 |
"Hit:1 https://cli.github.com/packages stable InRelease\n",
|
| 1701 |
+
"Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
|
| 1702 |
+
"Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n",
|
| 1703 |
+
"Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
|
| 1704 |
+
"Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n",
|
| 1705 |
+
"Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n",
|
| 1706 |
+
"Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
|
| 1707 |
+
"Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
|
| 1708 |
+
"Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
|
| 1709 |
+
"Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,287 kB]\n",
|
| 1710 |
+
"Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
|
| 1711 |
+
"Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,812 kB]\n",
|
| 1712 |
+
"Get:13 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n",
|
| 1713 |
+
"Get:14 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,372 kB]\n",
|
| 1714 |
+
"Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,778 kB]\n",
|
| 1715 |
+
"Get:16 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [69.2 kB]\n",
|
| 1716 |
+
"Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,594 kB]\n",
|
| 1717 |
+
"Get:18 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [5,988 kB]\n",
|
| 1718 |
+
"Fetched 25.3 MB in 3s (9,111 kB/s)\n",
|
|
|
|
|
|
|
| 1719 |
"Reading package lists... Done\n",
|
| 1720 |
"W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
|
| 1721 |
"Reading package lists... Done\n",
|
|
|
|
| 1874 |
" libwildmidi2 libwoff1 libxtst6 libyuv0 libzbar0 libzxingcore1\n",
|
| 1875 |
" session-migration timgm6mb-soundfont xfonts-cyrillic xfonts-encodings\n",
|
| 1876 |
" xfonts-scalable xfonts-utils\n",
|
| 1877 |
+
"0 upgraded, 94 newly installed, 0 to remove and 40 not upgraded.\n",
|
| 1878 |
"Need to get 48.2 MB of archives.\n",
|
| 1879 |
"After this operation, 123 MB of additional disk space will be used.\n",
|
| 1880 |
"Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-ipafont-gothic all 00303-21ubuntu1 [3,513 kB]\n",
|
|
|
|
| 1971 |
"Get:92 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-aacenc0 amd64 0.1.3-2 [69.4 kB]\n",
|
| 1972 |
"Get:93 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-amrwbenc0 amd64 0.1.3-2 [68.2 kB]\n",
|
| 1973 |
"Get:94 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 gstreamer1.0-plugins-bad amd64 1.20.3-0ubuntu1.1 [2,602 kB]\n",
|
| 1974 |
+
"Fetched 48.2 MB in 5s (10.3 MB/s)\n",
|
| 1975 |
"Extracting templates from packages: 100%\n",
|
| 1976 |
"Preconfiguring packages ...\n",
|
| 1977 |
"Selecting previously unselected package fonts-ipafont-gothic.\n",
|
|
|
|
| 2351 |
"Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...\n",
|
| 2352 |
"Processing triggers for libglib2.0-0:amd64 (2.72.4-0ubuntu2.6) ...\n",
|
| 2353 |
"Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
|
| 2354 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link\n",
|
| 2355 |
"\n",
|
| 2356 |
+
"/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is not a symbolic link\n",
|
| 2357 |
"\n",
|
| 2358 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
|
| 2359 |
"\n",
|
| 2360 |
+
"/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link\n",
|
| 2361 |
"\n",
|
| 2362 |
"/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
|
| 2363 |
"\n",
|
| 2364 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link\n",
|
| 2365 |
"\n",
|
| 2366 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
|
| 2367 |
"\n",
|
| 2368 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero_v2.so.0 is not a symbolic link\n",
|
| 2369 |
"\n",
|
| 2370 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link\n",
|
| 2371 |
"\n",
|
| 2372 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link\n",
|
| 2373 |
"\n",
|
| 2374 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
|
| 2375 |
"\n",
|
| 2376 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
|
| 2377 |
"\n",
|
| 2378 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
|
| 2379 |
"\n",
|
| 2380 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link\n",
|
| 2381 |
"\n",
|
| 2382 |
"Setting up glib-networking:amd64 (2.72.0-1) ...\n",
|
| 2383 |
"Setting up libsoup2.4-1:amd64 (2.74.2-3ubuntu0.6) ...\n",
|
|
|
|
| 2390 |
"Setting up gstreamer1.0-plugins-bad:amd64 (1.20.3-0ubuntu1.1) ...\n",
|
| 2391 |
"Processing triggers for dictionaries-common (1.28.14) ...\n",
|
| 2392 |
"Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
|
| 2393 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link\n",
|
| 2394 |
"\n",
|
| 2395 |
+
"/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is not a symbolic link\n",
|
| 2396 |
"\n",
|
| 2397 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
|
| 2398 |
"\n",
|
| 2399 |
+
"/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link\n",
|
| 2400 |
"\n",
|
| 2401 |
"/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
|
| 2402 |
"\n",
|
| 2403 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link\n",
|
| 2404 |
"\n",
|
| 2405 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
|
| 2406 |
"\n",
|
| 2407 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero_v2.so.0 is not a symbolic link\n",
|
| 2408 |
"\n",
|
| 2409 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link\n",
|
| 2410 |
"\n",
|
| 2411 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link\n",
|
| 2412 |
"\n",
|
| 2413 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
|
| 2414 |
"\n",
|
| 2415 |
+
"/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
|
| 2416 |
+
"\n",
|
| 2417 |
"/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
|
| 2418 |
"\n",
|
| 2419 |
+
"/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link\n",
|
| 2420 |
"\n",
|
| 2421 |
+
"Collecting pdfplumber\n",
|
| 2422 |
+
" Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
|
| 2423 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2424 |
+
"\u001b[?25hCollecting pdfminer.six==20250506 (from pdfplumber)\n",
|
| 2425 |
+
" Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
|
| 2426 |
+
"Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
|
| 2427 |
+
"Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
|
| 2428 |
+
" Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
|
| 2429 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2430 |
+
"\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.4)\n",
|
| 2431 |
+
"Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
|
| 2432 |
+
"Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.0.0)\n",
|
| 2433 |
+
"Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.23)\n",
|
| 2434 |
+
"Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
|
| 2435 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2436 |
+
"\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
|
| 2437 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m88.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2438 |
+
"\u001b[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
|
| 2439 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m120.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2440 |
+
"\u001b[?25hInstalling collected packages: pypdfium2, pdfminer.six, pdfplumber\n",
|
| 2441 |
+
"Successfully installed pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0\n",
|
| 2442 |
+
"Collecting trafilatura\n",
|
| 2443 |
+
" Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)\n",
|
| 2444 |
+
"Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from trafilatura) (2025.10.5)\n",
|
| 2445 |
+
"Requirement already satisfied: charset_normalizer>=3.4.0 in /usr/local/lib/python3.12/dist-packages (from trafilatura) (3.4.4)\n",
|
| 2446 |
+
"Collecting courlan>=1.3.2 (from trafilatura)\n",
|
| 2447 |
+
" Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)\n",
|
| 2448 |
+
"Collecting htmldate>=1.9.2 (from trafilatura)\n",
|
| 2449 |
+
" Downloading htmldate-1.9.3-py3-none-any.whl.metadata (10 kB)\n",
|
| 2450 |
+
"Collecting justext>=3.0.1 (from trafilatura)\n",
|
| 2451 |
+
" Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)\n",
|
| 2452 |
+
"Requirement already satisfied: lxml>=5.3.0 in /usr/local/lib/python3.12/dist-packages (from trafilatura) (5.4.0)\n",
|
| 2453 |
+
"Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/lib/python3.12/dist-packages (from trafilatura) (2.5.0)\n",
|
| 2454 |
+
"Requirement already satisfied: babel>=2.16.0 in /usr/local/lib/python3.12/dist-packages (from courlan>=1.3.2->trafilatura) (2.17.0)\n",
|
| 2455 |
+
"Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)\n",
|
| 2456 |
+
" Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)\n",
|
| 2457 |
+
"Collecting dateparser>=1.1.2 (from htmldate>=1.9.2->trafilatura)\n",
|
| 2458 |
+
" Downloading dateparser-1.2.2-py3-none-any.whl.metadata (29 kB)\n",
|
| 2459 |
+
"Requirement already satisfied: python-dateutil>=2.9.0.post0 in /usr/local/lib/python3.12/dist-packages (from htmldate>=1.9.2->trafilatura) (2.9.0.post0)\n",
|
| 2460 |
+
"Requirement already satisfied: pytz>=2024.2 in /usr/local/lib/python3.12/dist-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (2025.2)\n",
|
| 2461 |
+
"Requirement already satisfied: regex>=2024.9.11 in /usr/local/lib/python3.12/dist-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (2024.11.6)\n",
|
| 2462 |
+
"Requirement already satisfied: tzlocal>=0.2 in /usr/local/lib/python3.12/dist-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (5.3.1)\n",
|
| 2463 |
+
"Collecting lxml_html_clean (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura)\n",
|
| 2464 |
+
" Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)\n",
|
| 2465 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.9.0.post0->htmldate>=1.9.2->trafilatura) (1.17.0)\n",
|
| 2466 |
+
"Downloading trafilatura-2.0.0-py3-none-any.whl (132 kB)\n",
|
| 2467 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.6/132.6 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2468 |
+
"\u001b[?25hDownloading courlan-1.3.2-py3-none-any.whl (33 kB)\n",
|
| 2469 |
+
"Downloading htmldate-1.9.3-py3-none-any.whl (31 kB)\n",
|
| 2470 |
+
"Downloading justext-3.0.2-py2.py3-none-any.whl (837 kB)\n",
|
| 2471 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m837.9/837.9 kB\u001b[0m \u001b[31m32.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2472 |
+
"\u001b[?25hDownloading dateparser-1.2.2-py3-none-any.whl (315 kB)\n",
|
| 2473 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m315.5/315.5 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2474 |
+
"\u001b[?25hDownloading tld-0.13.1-py2.py3-none-any.whl (274 kB)\n",
|
| 2475 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.7/274.7 kB\u001b[0m \u001b[31m23.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
| 2476 |
+
"\u001b[?25hDownloading lxml_html_clean-0.4.3-py3-none-any.whl (14 kB)\n",
|
| 2477 |
+
"Installing collected packages: tld, lxml_html_clean, dateparser, courlan, justext, htmldate, trafilatura\n",
|
| 2478 |
+
"Successfully installed courlan-1.3.2 dateparser-1.2.2 htmldate-1.9.3 justext-3.0.2 lxml_html_clean-0.4.3 tld-0.13.1 trafilatura-2.0.0\n"
|
| 2479 |
+
]
|
| 2480 |
+
}
|
| 2481 |
+
]
|
| 2482 |
+
},
|
| 2483 |
+
{
|
| 2484 |
+
"cell_type": "code",
|
| 2485 |
+
"execution_count": 3,
|
| 2486 |
+
"id": "wVdx5j24HKcp",
|
| 2487 |
+
"metadata": {
|
| 2488 |
+
"colab": {
|
| 2489 |
+
"base_uri": "https://localhost:8080/"
|
| 2490 |
+
},
|
| 2491 |
+
"id": "wVdx5j24HKcp",
|
| 2492 |
+
"outputId": "01d9c03a-d9d5-43b2-9626-e29ed85360d2"
|
| 2493 |
+
},
|
| 2494 |
+
"outputs": [
|
| 2495 |
+
{
|
| 2496 |
+
"output_type": "stream",
|
| 2497 |
+
"name": "stdout",
|
| 2498 |
+
"text": [
|
| 2499 |
+
"Installing dependencies...\n",
|
| 2500 |
+
"Hit:1 https://cli.github.com/packages stable InRelease\n",
|
| 2501 |
+
"Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease\n",
|
| 2502 |
+
"Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
|
| 2503 |
+
"Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
|
| 2504 |
+
"Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease\n",
|
| 2505 |
+
"Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease\n",
|
| 2506 |
+
"Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease\n",
|
| 2507 |
+
"Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease\n",
|
| 2508 |
+
"Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
|
| 2509 |
+
"Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
|
| 2510 |
+
"Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
|
| 2511 |
+
"Reading package lists... Done\n",
|
| 2512 |
+
"W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
|
| 2513 |
+
"Reading package lists... Done\n",
|
| 2514 |
+
"Building dependency tree... Done\n",
|
| 2515 |
+
"Reading state information... Done\n",
|
| 2516 |
+
"fonts-freefont-ttf is already the newest version (20120503-10build1).\n",
|
| 2517 |
+
"fonts-liberation is already the newest version (1:1.07.4-11).\n",
|
| 2518 |
+
"libasound2 is already the newest version (1.2.6.1-1ubuntu1).\n",
|
| 2519 |
+
"libatk-bridge2.0-0 is already the newest version (2.38.0-3).\n",
|
| 2520 |
+
"libatk1.0-0 is already the newest version (2.36.0-3build1).\n",
|
| 2521 |
+
"libatspi2.0-0 is already the newest version (2.44.0-3).\n",
|
| 2522 |
+
"libcairo-gobject2 is already the newest version (1.16.0-5ubuntu2).\n",
|
| 2523 |
+
"libcairo2 is already the newest version (1.16.0-5ubuntu2).\n",
|
| 2524 |
+
"libdbus-glib-1-2 is already the newest version (0.112-2build1).\n",
|
| 2525 |
+
"libegl1 is already the newest version (1.4.0-1).\n",
|
| 2526 |
+
"libenchant-2-2 is already the newest version (2.3.2-1ubuntu2).\n",
|
| 2527 |
+
"libepoxy0 is already the newest version (1.5.10-1).\n",
|
| 2528 |
+
"libevdev2 is already the newest version (1.12.1+dfsg-1).\n",
|
| 2529 |
+
"libevent-2.1-7 is already the newest version (2.1.12-stable-1build3).\n",
|
| 2530 |
+
"libfontconfig1 is already the newest version (2.13.1-4.2ubuntu5).\n",
|
| 2531 |
+
"libgles2 is already the newest version (1.4.0-1).\n",
|
| 2532 |
+
"libglx0 is already the newest version (1.4.0-1).\n",
|
| 2533 |
+
"libgudev-1.0-0 is already the newest version (1:237-2build1).\n",
|
| 2534 |
+
"libhyphen0 is already the newest version (2.8.8-7build2).\n",
|
| 2535 |
+
"libicu70 is already the newest version (70.1-2).\n",
|
| 2536 |
+
"libjpeg-turbo8 is already the newest version (2.1.2-0ubuntu1).\n",
|
| 2537 |
+
"liblcms2-2 is already the newest version (2.12~rc1-2build2).\n",
|
| 2538 |
+
"libmanette-0.2-0 is already the newest version (0.2.6-3build1).\n",
|
| 2539 |
+
"libopengl0 is already the newest version (1.4.0-1).\n",
|
| 2540 |
+
"libopus0 is already the newest version (1.3.1-0.1build2).\n",
|
| 2541 |
+
"libpng16-16 is already the newest version (1.6.37-3build5).\n",
|
| 2542 |
+
"libproxy1v5 is already the newest version (0.4.17-2).\n",
|
| 2543 |
+
"libsecret-1-0 is already the newest version (0.20.5-2).\n",
|
| 2544 |
+
"libwoff1 is already the newest version (1.0.2-1build4).\n",
|
| 2545 |
+
"libxcb-shm0 is already the newest version (1.14-3ubuntu3).\n",
|
| 2546 |
+
"libxcb1 is already the newest version (1.14-3ubuntu3).\n",
|
| 2547 |
+
"libxcomposite1 is already the newest version (1:0.4.5-1build2).\n",
|
| 2548 |
+
"libxcursor1 is already the newest version (1:1.2.0-2build4).\n",
|
| 2549 |
+
"libxdamage1 is already the newest version (1:1.1.5-2build2).\n",
|
| 2550 |
+
"libxext6 is already the newest version (2:1.3.4-1build1).\n",
|
| 2551 |
+
"libxfixes3 is already the newest version (1:6.0.0-1).\n",
|
| 2552 |
+
"libxi6 is already the newest version (2:1.8-1build1).\n",
|
| 2553 |
+
"libxkbcommon0 is already the newest version (1.4.0-1).\n",
|
| 2554 |
+
"libxrandr2 is already the newest version (2:1.5.2-1build1).\n",
|
| 2555 |
+
"libxrender1 is already the newest version (1:0.9.10-1build4).\n",
|
| 2556 |
+
"libxtst6 is already the newest version (2:1.2.3-1build4).\n",
|
| 2557 |
+
"xfonts-scalable is already the newest version (1:1.0.3-1.2ubuntu1).\n",
|
| 2558 |
+
"fonts-ipafont-gothic is already the newest version (00303-21ubuntu1).\n",
|
| 2559 |
+
"fonts-tlwg-loma-otf is already the newest version (1:0.7.3-1).\n",
|
| 2560 |
+
"fonts-unifont is already the newest version (1:14.0.01-1).\n",
|
| 2561 |
+
"fonts-wqy-zenhei is already the newest version (0.9.45-8).\n",
|
| 2562 |
+
"libavif13 is already the newest version (0.9.3-3).\n",
|
| 2563 |
+
"libffi7 is already the newest version (3.3-5ubuntu1).\n",
|
| 2564 |
+
"libx264-163 is already the newest version (2:0.163.3060+git5db6aa6-2build1).\n",
|
| 2565 |
+
"xfonts-cyrillic is already the newest version (1:1.0.5).\n",
|
| 2566 |
+
"fonts-noto-color-emoji is already the newest version (2.047-0ubuntu0.22.04.1).\n",
|
| 2567 |
+
"gstreamer1.0-plugins-base is already the newest version (1.20.1-1ubuntu0.5).\n",
|
| 2568 |
+
"gstreamer1.0-plugins-good is already the newest version (1.20.3-0ubuntu1.4).\n",
|
| 2569 |
+
"libatomic1 is already the newest version (12.3.0-1ubuntu1~22.04.2).\n",
|
| 2570 |
+
"libcups2 is already the newest version (2.4.1op1-1ubuntu4.12).\n",
|
| 2571 |
+
"libdbus-1-3 is already the newest version (1.12.20-2ubuntu4.1).\n",
|
| 2572 |
+
"libdrm2 is already the newest version (2.4.113-2~ubuntu0.22.04.1).\n",
|
| 2573 |
+
"libfreetype6 is already the newest version (2.11.1+dfsg-1ubuntu0.3).\n",
|
| 2574 |
+
"libgbm1 is already the newest version (23.2.1-1ubuntu3.1~22.04.3).\n",
|
| 2575 |
+
"libgdk-pixbuf-2.0-0 is already the newest version (2.42.8+dfsg-1ubuntu0.4).\n",
|
| 2576 |
+
"libglib2.0-0 is already the newest version (2.72.4-0ubuntu2.6).\n",
|
| 2577 |
+
"libgstreamer-gl1.0-0 is already the newest version (1.20.1-1ubuntu0.5).\n",
|
| 2578 |
+
"libgstreamer-plugins-base1.0-0 is already the newest version (1.20.1-1ubuntu0.5).\n",
|
| 2579 |
+
"libgstreamer1.0-0 is already the newest version (1.20.3-0ubuntu1.1).\n",
|
| 2580 |
+
"libgtk-3-0 is already the newest version (3.24.33-1ubuntu2.2).\n",
|
| 2581 |
+
"libgtk-4-1 is already the newest version (4.6.9+ds-0ubuntu0.22.04.2).\n",
|
| 2582 |
+
"libharfbuzz-icu0 is already the newest version (2.7.4-1ubuntu3.2).\n",
|
| 2583 |
+
"libharfbuzz0b is already the newest version (2.7.4-1ubuntu3.2).\n",
|
| 2584 |
+
"libnotify4 is already the newest version (0.7.9-3ubuntu5.22.04.1).\n",
|
| 2585 |
+
"libnspr4 is already the newest version (2:4.35-0ubuntu0.22.04.1).\n",
|
| 2586 |
+
"libnss3 is already the newest version (2:3.98-0ubuntu0.22.04.2).\n",
|
| 2587 |
+
"libopenjp2-7 is already the newest version (2.4.0-6ubuntu0.4).\n",
|
| 2588 |
+
"libpango-1.0-0 is already the newest version (1.50.6+ds-2ubuntu1).\n",
|
| 2589 |
+
"libpangocairo-1.0-0 is already the newest version (1.50.6+ds-2ubuntu1).\n",
|
| 2590 |
+
"libwayland-client0 is already the newest version (1.20.0-1ubuntu0.1).\n",
|
| 2591 |
+
"libwayland-egl1 is already the newest version (1.20.0-1ubuntu0.1).\n",
|
| 2592 |
+
"libwayland-server0 is already the newest version (1.20.0-1ubuntu0.1).\n",
|
| 2593 |
+
"libwebpdemux2 is already the newest version (1.2.2-2ubuntu0.22.04.2).\n",
|
| 2594 |
+
"libx11-6 is already the newest version (2:1.7.5-1ubuntu0.3).\n",
|
| 2595 |
+
"libx11-xcb1 is already the newest version (2:1.7.5-1ubuntu0.3).\n",
|
| 2596 |
+
"libxml2 is already the newest version (2.9.13+dfsg-1ubuntu0.9).\n",
|
| 2597 |
+
"libxslt1.1 is already the newest version (1.1.34-4ubuntu0.22.04.4).\n",
|
| 2598 |
+
"ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
|
| 2599 |
+
"gstreamer1.0-libav is already the newest version (1.20.3-0ubuntu1).\n",
|
| 2600 |
+
"gstreamer1.0-plugins-bad is already the newest version (1.20.3-0ubuntu1.1).\n",
|
| 2601 |
+
"libsoup-3.0-0 is already the newest version (3.0.7-0ubuntu1).\n",
|
| 2602 |
+
"xvfb is already the newest version (2:21.1.4-2ubuntu1.7~22.04.15).\n",
|
| 2603 |
+
"0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.\n"
|
| 2604 |
]
|
| 2605 |
}
|
| 2606 |
],
|
|
|
|
| 2611 |
},
|
| 2612 |
{
|
| 2613 |
"cell_type": "code",
|
| 2614 |
+
"execution_count": 6,
|
| 2615 |
"id": "uNgeNVFoMErV",
|
| 2616 |
"metadata": {
|
| 2617 |
"id": "uNgeNVFoMErV"
|
|
|
|
| 2619 |
"outputs": [],
|
| 2620 |
"source": [
|
| 2621 |
"import requests\n",
|
| 2622 |
+
"from pdfminer.high_level import extract_text\n",
|
| 2623 |
+
"import asyncio\n",
|
| 2624 |
+
"import aiohttp\n",
|
| 2625 |
+
"import json"
|
| 2626 |
]
|
| 2627 |
},
|
| 2628 |
{
|
| 2629 |
"cell_type": "code",
|
| 2630 |
+
"execution_count": 7,
|
| 2631 |
"id": "tvAnpg8zMA08",
|
| 2632 |
"metadata": {
|
| 2633 |
"id": "tvAnpg8zMA08"
|
| 2634 |
},
|
| 2635 |
"outputs": [],
|
| 2636 |
"source": [
|
| 2637 |
+
"async def extract_text_from_pdf(url: str, session: aiohttp.ClientSession) -> str | None:\n",
|
| 2638 |
" \"\"\"\n",
|
| 2639 |
+
" Tải file PDF từ URL và trích xuất văn bản nhanh bằng pdfminer.six (dùng session async)\n",
|
| 2640 |
" \"\"\"\n",
|
| 2641 |
+
" print(f\" -> Detect PDF link. Handle by pdfminer.six: {url}\")\n",
|
| 2642 |
" try:\n",
|
| 2643 |
+
" async with session.get(url, timeout=60) as response:\n",
|
| 2644 |
+
" if response.status != 200:\n",
|
| 2645 |
+
" print(f\"❌ Failed to download PDF ({response.status})\")\n",
|
| 2646 |
+
" return None\n",
|
| 2647 |
"\n",
|
| 2648 |
+
" data = await response.read()\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2649 |
"\n",
|
| 2650 |
+
" text = extract_text(io.BytesIO(data))\n",
|
| 2651 |
+
" print(\"✅ SUCCESS! Extracted text using pdfminer.six ---\")\n",
|
| 2652 |
+
" return text.strip()\n",
|
| 2653 |
"\n",
|
| 2654 |
" except Exception as e:\n",
|
| 2655 |
+
" print(f\"❌ Error when open file PDF {url}: {e}\")\n",
|
| 2656 |
" return None"
|
| 2657 |
]
|
| 2658 |
},
|
| 2659 |
{
|
| 2660 |
"cell_type": "code",
|
| 2661 |
+
"execution_count": 16,
|
| 2662 |
"id": "xSKWwAbIBwTu",
|
| 2663 |
"metadata": {
|
| 2664 |
"id": "xSKWwAbIBwTu"
|
|
|
|
| 2666 |
"outputs": [],
|
| 2667 |
"source": [
|
| 2668 |
"from playwright.async_api import async_playwright\n",
|
| 2669 |
+
"from playwright_stealth import Stealth\n",
|
| 2670 |
"import trafilatura"
|
| 2671 |
]
|
| 2672 |
},
|
| 2673 |
{
|
| 2674 |
"cell_type": "code",
|
| 2675 |
+
"execution_count": 17,
|
| 2676 |
"id": "6QF-79pKSBw1",
|
| 2677 |
"metadata": {
|
| 2678 |
"id": "6QF-79pKSBw1"
|
| 2679 |
},
|
| 2680 |
"outputs": [],
|
| 2681 |
"source": [
|
| 2682 |
+
"async def extract_text_from_web(url: str, session: aiohttp.ClientSession) -> str | None:\n",
|
| 2683 |
" \"\"\"\n",
|
| 2684 |
+
" Thử dùng trafilatura để trích xuất nội dung trước, nếu có lỗi\n",
|
| 2685 |
+
" thì dùng Playwright hoặc Stealth Playwright để lấy nội dung HTML.\n",
|
| 2686 |
" \"\"\"\n",
|
| 2687 |
+
" print(f\" -> Detect web link. Handle by Trafilatura: {url}\")\n",
|
| 2688 |
"\n",
|
|
|
|
| 2689 |
" try:\n",
|
| 2690 |
+
" # Fetch HTML bằng session (nhanh hơn nhiều so với trafilatura.fetch_url)\n",
|
| 2691 |
+
" async with session.get(url, timeout=30) as resp:\n",
|
| 2692 |
+
" if resp.status != 200:\n",
|
| 2693 |
+
" raise ValueError(f\"HTTP {resp.status}\")\n",
|
| 2694 |
+
" html_content = await resp.text()\n",
|
| 2695 |
"\n",
|
| 2696 |
+
" text = trafilatura.extract(html_content)\n",
|
| 2697 |
+
" if text:\n",
|
| 2698 |
+
" print(\"✅ SUCCESS! Extracted text using Trafilatura ---\")\n",
|
| 2699 |
+
" return text\n",
|
| 2700 |
"\n",
|
| 2701 |
+
" raise ValueError(\"Trafilatura extraction return None\")\n",
|
|
|
|
| 2702 |
"\n",
|
| 2703 |
+
" except Exception as e:\n",
|
| 2704 |
+
" print(f\"⚠️ Error using Trafilatura for URL {url}: {e}\")\n",
|
| 2705 |
+
" print(f\"Falling back to Playwright extraction ...\")\n",
|
| 2706 |
"\n",
|
| 2707 |
+
" try:\n",
|
| 2708 |
+
" async with async_playwright() as p:\n",
|
| 2709 |
+
" browser = await p.chromium.launch(headless=True)\n",
|
| 2710 |
+
" page = await browser.new_page()\n",
|
| 2711 |
"\n",
|
| 2712 |
+
" await page.goto(url, timeout=15000, wait_until=\"domcontentloaded\")\n",
|
| 2713 |
+
" html_content = await page.content()\n",
|
| 2714 |
+
" await browser.close()\n",
|
| 2715 |
+
"\n",
|
| 2716 |
+
" if not html_content:\n",
|
| 2717 |
+
" return None\n",
|
| 2718 |
+
"\n",
|
| 2719 |
+
" # Nếu bị Cloudflare block, fallback stealth\n",
|
| 2720 |
+
" if \"Cloudflare Ray ID\" in html_content:\n",
|
| 2721 |
+
" print(\"⚠️ Detected Cloudflare! Retrying with Stealth...\")\n",
|
| 2722 |
+
" async with Stealth().use_async(async_playwright()) as p2:\n",
|
| 2723 |
+
" browser = await p2.chromium.launch(headless=True)\n",
|
| 2724 |
+
" page = await browser.new_page()\n",
|
| 2725 |
+
" await page.goto(url, timeout=20000)\n",
|
| 2726 |
+
" html_content = await page.content()\n",
|
| 2727 |
+
" await browser.close()\n",
|
| 2728 |
+
"\n",
|
| 2729 |
+
" main_text = trafilatura.extract(html_content, include_comments=False)\n",
|
| 2730 |
+
" print(f\"✅ SUCCESS! Extracted text using Playwright ---\")\n",
|
| 2731 |
+
" return main_text\n",
|
| 2732 |
+
"\n",
|
| 2733 |
+
" except Exception as e1:\n",
|
| 2734 |
+
" print(f\"❌ Error using Playwright extraction for URL {url}: {e1}\")\n",
|
| 2735 |
+
" return None"
|
| 2736 |
]
|
| 2737 |
},
|
| 2738 |
{
|
| 2739 |
"cell_type": "code",
|
| 2740 |
+
"execution_count": 10,
|
| 2741 |
"id": "rL1vDTvHMwAj",
|
| 2742 |
"metadata": {
|
| 2743 |
"id": "rL1vDTvHMwAj"
|
| 2744 |
},
|
| 2745 |
"outputs": [],
|
| 2746 |
"source": [
|
| 2747 |
+
"async def fetch_content_from_url(url: str, session: aiohttp.ClientSession) -> str | None:\n",
|
| 2748 |
" \"\"\"\n",
|
| 2749 |
" Hàm điều phối: Kiểm tra loại URL và gọi hàm xử lý tương ứng.\n",
|
| 2750 |
" \"\"\"\n",
|
|
|
|
| 2751 |
" if url.lower().endswith('.pdf'):\n",
|
| 2752 |
+
" return await extract_text_from_pdf(url, session)\n",
|
| 2753 |
" else:\n",
|
| 2754 |
+
" return await extract_text_from_web(url, session)"
|
|
|
|
| 2755 |
]
|
| 2756 |
},
|
| 2757 |
{
|
| 2758 |
"cell_type": "code",
|
| 2759 |
+
"execution_count": 11,
|
| 2760 |
"id": "w3y0tq_pLIXu",
|
| 2761 |
"metadata": {
|
| 2762 |
"id": "w3y0tq_pLIXu"
|
|
|
|
| 2793 |
},
|
| 2794 |
{
|
| 2795 |
"cell_type": "code",
|
| 2796 |
+
"execution_count": 12,
|
| 2797 |
"id": "0eJs0RfoBz5o",
|
| 2798 |
"metadata": {
|
| 2799 |
"id": "0eJs0RfoBz5o"
|
| 2800 |
},
|
| 2801 |
"outputs": [],
|
| 2802 |
"source": [
|
| 2803 |
+
"import json\n",
|
| 2804 |
+
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
| 2805 |
+
"\n",
|
| 2806 |
"def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50) -> list[str]:\n",
|
| 2807 |
" \"\"\"Hàm tiện ích để chia văn bản dài thành các chunk nhỏ hơn.\"\"\"\n",
|
| 2808 |
" text_splitter = RecursiveCharacterTextSplitter(\n",
|
|
|
|
| 2815 |
},
|
| 2816 |
{
|
| 2817 |
"cell_type": "code",
|
| 2818 |
+
"execution_count": 13,
|
| 2819 |
+
"id": "BsLJSWmbHrPf",
|
| 2820 |
"metadata": {
|
| 2821 |
+
"id": "BsLJSWmbHrPf"
|
| 2822 |
},
|
| 2823 |
"outputs": [],
|
| 2824 |
"source": [
|
| 2825 |
+
"async def process_claims_parallel(retrieved_data):\n",
|
| 2826 |
+
" evidence_by_claim = {}\n",
|
| 2827 |
+
" claims = list(retrieved_data.keys())\n",
|
| 2828 |
+
"\n",
|
| 2829 |
+
" async with aiohttp.ClientSession() as session:\n",
|
| 2830 |
+
" for claim in claims:\n",
|
| 2831 |
+
" print(f\"\\n{'='*50}\\nHandle claim: '{claim}'\")\n",
|
| 2832 |
+
" documents = retrieved_data[claim]\n",
|
| 2833 |
+
" all_chunks_for_this_claim = []\n",
|
| 2834 |
+
"\n",
|
| 2835 |
+
" # Gom tất cả link cần crawl\n",
|
| 2836 |
+
" urls = [doc['link'] for doc in documents]\n",
|
| 2837 |
+
"\n",
|
| 2838 |
+
" # Chạy crawl song song\n",
|
| 2839 |
+
" print(f\" -> Crawling {len(urls)} links in parallel...\")\n",
|
| 2840 |
+
" tasks = [fetch_content_from_url(u, session) for u in urls]\n",
|
| 2841 |
+
" full_contents = await asyncio.gather(*tasks)\n",
|
| 2842 |
+
"\n",
|
| 2843 |
+
" # Ghép kết quả với từng doc\n",
|
| 2844 |
+
" for doc, full_content in zip(documents, full_contents):\n",
|
| 2845 |
+
" content_to_process = \"\"\n",
|
| 2846 |
+
"\n",
|
| 2847 |
+
" if full_content and len(full_content) > 100:\n",
|
| 2848 |
+
" print(f\"SUCCESS!! {doc['link']}\")\n",
|
| 2849 |
+
" cleaned_full_content = clean_text(full_content)\n",
|
| 2850 |
+
" content_to_process = f\"{doc.get('title', '')}. {cleaned_full_content}\"\n",
|
| 2851 |
+
" else:\n",
|
| 2852 |
+
" print(f\"FAIL!! Using snippet for {doc['link']}\")\n",
|
| 2853 |
+
" cleaned_snippet = clean_text(doc.get('snippet', ''))\n",
|
| 2854 |
+
" content_to_process = f\"{doc.get('title', '')}. {cleaned_snippet}\"\n",
|
| 2855 |
+
"\n",
|
| 2856 |
+
" # Chia nhỏ nội dung\n",
|
| 2857 |
+
" chunks = chunk_text(content_to_process)\n",
|
| 2858 |
+
"\n",
|
| 2859 |
+
" # Lưu lại\n",
|
| 2860 |
+
" for chunk_text_part in chunks:\n",
|
| 2861 |
+
" all_chunks_for_this_claim.append({\n",
|
| 2862 |
+
" \"text\": chunk_text_part,\n",
|
| 2863 |
+
" \"link\": doc['link']\n",
|
| 2864 |
+
" })\n",
|
| 2865 |
+
"\n",
|
| 2866 |
+
" evidence_by_claim[claim] = all_chunks_for_this_claim\n",
|
| 2867 |
+
" print(f\"==> Finish for claim '{claim}'. Total: {len(all_chunks_for_this_claim)} chunks.\")\n",
|
| 2868 |
+
"\n",
|
| 2869 |
+
" return evidence_by_claim"
|
| 2870 |
]
|
| 2871 |
},
|
| 2872 |
{
|
| 2873 |
"cell_type": "code",
|
| 2874 |
+
"source": [
|
| 2875 |
+
"# Tải dữ liệu\n",
|
| 2876 |
+
"with open('document_retrieval_results.json', 'r', encoding='utf-8') as f:\n",
|
| 2877 |
+
" retrieved_data = json.load(f)\n",
|
| 2878 |
+
"\n",
|
| 2879 |
+
"evidence_by_claim = await process_claims_parallel(retrieved_data)\n",
|
| 2880 |
+
"\n",
|
| 2881 |
+
"# Lưu lại nếu cần\n",
|
| 2882 |
+
"# with open('evidence_chunks.json', 'w', encoding='utf-8') as f:\n",
|
| 2883 |
+
"# json.dump(evidence_by_claim, f, ensure_ascii=False, indent=2)"
|
| 2884 |
+
],
|
| 2885 |
"metadata": {
|
| 2886 |
"colab": {
|
| 2887 |
"base_uri": "https://localhost:8080/"
|
| 2888 |
},
|
| 2889 |
+
"id": "hFOOzyCCTWEq",
|
| 2890 |
+
"outputId": "16b9e384-ac73-45c4-a64e-cf4e0c3a9059"
|
| 2891 |
},
|
| 2892 |
+
"id": "hFOOzyCCTWEq",
|
| 2893 |
+
"execution_count": 18,
|
| 2894 |
"outputs": [
|
| 2895 |
{
|
|
|
|
| 2896 |
"output_type": "stream",
|
| 2897 |
+
"name": "stdout",
|
| 2898 |
"text": [
|
| 2899 |
"\n",
|
| 2900 |
"==================================================\n",
|
| 2901 |
"Handle claim: 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '\n",
|
| 2902 |
+
" -> Crawling 12 links in parallel...\n",
|
| 2903 |
+
" -> Detect web link. Handle by Trafilatura: https://vnexpress.net/bien-doi-khi-hau-cham-ngoi-cho-thoi-tiet-cuc-doan-the-nao-4739038.html\n",
|
| 2904 |
+
" -> Detect web link. Handle by Trafilatura: https://special.nhandan.vn/biendoikhihauvahanhdongcuavietnam/index.html\n",
|
| 2905 |
+
" -> Detect web link. Handle by Trafilatura: https://nhandan.vn/thich-ung-bien-doi-khi-hau-thuan-thien-ben-vung-post909799.html\n",
|
| 2906 |
+
" -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false\n",
|
| 2907 |
+
" -> Detect web link. Handle by Trafilatura: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
|
| 2908 |
+
" -> Detect web link. Handle by Trafilatura: https://vnexpress.net/bien-doi-khi-hau-4796505.html\n",
|
| 2909 |
+
" -> Detect web link. Handle by Trafilatura: https://vnexpress.net/bien-doi-khi-hau-khien-la-nina-co-yeu-to-di-thuong-4791345.html\n",
|
| 2910 |
+
" -> Detect web link. Handle by Trafilatura: https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe\n",
|
| 2911 |
+
" -> Detect web link. Handle by Trafilatura: https://nhandan.vn/hoi-chuong-bao-dong-ve-muc-do-nghiem-trong-cua-cuoc-khung-hoang-khi-hau-post893263.html\n",
|
| 2912 |
+
" -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau\n",
|
| 2913 |
+
" -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem\n",
|
| 2914 |
+
" -> Detect web link. Handle by Trafilatura: https://nhandan.vn/thoi-tiet-cuc-doan-anh-huong-nghiem-trong-den-nen-kinh-te-anh-post712261.html\n",
|
| 2915 |
+
"⚠️ Error using Trafilatura for URL https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe: HTTP 403\n",
|
| 2916 |
+
"Falling back to Playwright extraction ...\n",
|
| 2917 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2918 |
+
"⚠️ Error using Trafilatura for URL https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem: Cannot connect to host moh.gov.vn:443 ssl:default [[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1010)]\n",
|
| 2919 |
+
"Falling back to Playwright extraction ...\n",
|
| 2920 |
+
"⚠️ Error using Trafilatura for URL https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau: Cannot connect to host moh.gov.vn:443 ssl:default [[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1010)]\n",
|
| 2921 |
+
"Falling back to Playwright extraction ...\n",
|
| 2922 |
+
"⚠️ Error using Trafilatura for URL https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false: Cannot connect to host moh.gov.vn:443 ssl:default [[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1010)]\n",
|
| 2923 |
+
"Falling back to Playwright extraction ...\n",
|
| 2924 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2925 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2926 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2927 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2928 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2929 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2930 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2931 |
+
"⚠️ Detected Cloudflare! Retrying with Stealth...\n",
|
| 2932 |
+
"✅ SUCCESS! Extracted text using Playwright ---\n",
|
| 2933 |
+
"✅ SUCCESS! Extracted text using Playwright ---\n",
|
| 2934 |
+
"✅ SUCCESS! Extracted text using Playwright ---\n",
|
| 2935 |
+
"✅ SUCCESS! Extracted text using Playwright ---\n",
|
| 2936 |
+
"SUCCESS!! https://vnexpress.net/bien-doi-khi-hau-cham-ngoi-cho-thoi-tiet-cuc-doan-the-nao-4739038.html\n",
|
| 2937 |
+
"SUCCESS!! https://special.nhandan.vn/biendoikhihauvahanhdongcuavietnam/index.html\n",
|
| 2938 |
+
"SUCCESS!! https://nhandan.vn/thich-ung-bien-doi-khi-hau-thuan-thien-ben-vung-post909799.html\n",
|
| 2939 |
+
"SUCCESS!! https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false\n",
|
| 2940 |
+
"SUCCESS!! https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
|
| 2941 |
+
"SUCCESS!! https://vnexpress.net/bien-doi-khi-hau-4796505.html\n",
|
| 2942 |
+
"SUCCESS!! https://vnexpress.net/bien-doi-khi-hau-khien-la-nina-co-yeu-to-di-thuong-4791345.html\n",
|
| 2943 |
+
"SUCCESS!! https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe\n",
|
| 2944 |
+
"SUCCESS!! https://nhandan.vn/hoi-chuong-bao-dong-ve-muc-do-nghiem-trong-cua-cuoc-khung-hoang-khi-hau-post893263.html\n"
|
| 2945 |
+
]
|
| 2946 |
+
},
|
| 2947 |
+
{
|
| 2948 |
+
"output_type": "stream",
|
| 2949 |
+
"name": "stderr",
|
| 2950 |
+
"text": [
|
| 2951 |
+
"WARNING:pdfminer.pdfpage:The PDF <_io.BytesIO object at 0x780575b075b0> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case\n"
|
| 2952 |
+
]
|
| 2953 |
+
},
|
| 2954 |
+
{
|
| 2955 |
+
"output_type": "stream",
|
| 2956 |
+
"name": "stdout",
|
| 2957 |
+
"text": [
|
| 2958 |
+
"SUCCESS!! https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau\n",
|
| 2959 |
+
"SUCCESS!! https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem\n",
|
| 2960 |
+
"SUCCESS!! https://nhandan.vn/thoi-tiet-cuc-doan-anh-huong-nghiem-trong-den-nen-kinh-te-anh-post712261.html\n",
|
| 2961 |
+
"==> Finish for claim 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '. Total: 178 chunks.\n",
|
|
|
|
| 2962 |
"\n",
|
| 2963 |
"==================================================\n",
|
| 2964 |
"Handle claim: 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'\n",
|
| 2965 |
+
" -> Crawling 10 links in parallel...\n",
|
| 2966 |
+
" -> Detect web link. Handle by Trafilatura: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
|
| 2967 |
+
" -> Detect web link. Handle by Trafilatura: https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/\n",
|
| 2968 |
+
" -> Detect PDF link. Handle by pdfminer.six: https://documents1.worldbank.org/curated/en/099051625143037334/pdf/P176996-1f81a83a-aa8f-49d2-84ea-ec7d286593c5.pdf\n",
|
| 2969 |
+
" -> Detect PDF link. Handle by pdfminer.six: https://documents1.worldbank.org/curated/en/099152108232435513/pdf/IDU-00472f84-1adf-466c-9688-9d150a0879da.pdf\n",
|
| 2970 |
+
" -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
|
| 2971 |
+
" -> Detect PDF link. Handle by pdfminer.six: https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
|
| 2972 |
+
" -> Detect PDF link. Handle by pdfminer.six: https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
|
| 2973 |
+
" -> Detect web link. Handle by Trafilatura: http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
|
| 2974 |
+
" -> Detect web link. Handle by Trafilatura: https://loigiaihay.com/bai-tap-245461.html\n",
|
| 2975 |
+
" -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\n",
|
| 2976 |
+
"✅ SUCCESS! Extracted text using pdfminer.six ---\n",
|
| 2977 |
+
"✅ SUCCESS! Extracted text using pdfminer.six ---\n",
|
| 2978 |
+
"✅ SUCCESS! Extracted text using pdfminer.six ---\n",
|
| 2979 |
+
"⚠️ Error using Trafilatura for URL https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75: Cannot connect to host moh.gov.vn:443 ssl:default [Connect call failed ('103.124.60.20', 443)]\n",
|
| 2980 |
+
"Falling back to Playwright extraction ...\n",
|
| 2981 |
+
"⚠️ Error using Trafilatura for URL https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc: Cannot connect to host moh.gov.vn:443 ssl:default [Connect call failed ('103.124.60.20', 443)]\n",
|
| 2982 |
+
"Falling back to Playwright extraction ...\n",
|
| 2983 |
+
"✅ SUCCESS! Extracted text using pdfminer.six ---\n",
|
| 2984 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2985 |
+
"✅ SUCCESS! Extracted text using Trafilatura ---\n",
|
| 2986 |
+
"❌ Error using Playwright extraction for URL https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75: Page.goto: Download is starting\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2987 |
"Call log:\n",
|
| 2988 |
+
" - navigating to \"https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\", waiting until \"domcontentloaded\"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2989 |
"\n",
|
| 2990 |
+
"⚠️ Error using Trafilatura for URL https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/: \n",
|
| 2991 |
+
"Falling back to Playwright extraction ...\n",
|
| 2992 |
+
"⚠️ Error using Trafilatura for URL http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html: \n",
|
| 2993 |
+
"Falling back to Playwright extraction ...\n",
|
| 2994 |
+
"✅ SUCCESS! Extracted text using Playwright ---\n",
|
| 2995 |
+
"✅ SUCCESS! Extracted text using Playwright ---\n",
|
| 2996 |
+
"❌ Error using Playwright extraction for URL http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html: Page.goto: Timeout 15000ms exceeded.\n",
|
| 2997 |
"Call log:\n",
|
| 2998 |
+
" - navigating to \"http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\", waiting until \"domcontentloaded\"\n",
|
| 2999 |
"\n",
|
| 3000 |
+
"SUCCESS!! https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
|
| 3001 |
+
"SUCCESS!! https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/\n",
|
| 3002 |
+
"SUCCESS!! https://documents1.worldbank.org/curated/en/099051625143037334/pdf/P176996-1f81a83a-aa8f-49d2-84ea-ec7d286593c5.pdf\n",
|
| 3003 |
+
"SUCCESS!! https://documents1.worldbank.org/curated/en/099152108232435513/pdf/IDU-00472f84-1adf-466c-9688-9d150a0879da.pdf\n",
|
| 3004 |
+
"SUCCESS!! https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
|
| 3005 |
+
"SUCCESS!! https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
|
| 3006 |
+
"SUCCESS!! https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
|
| 3007 |
+
"FAIL!! Using snippet for http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
|
| 3008 |
+
"SUCCESS!! https://loigiaihay.com/bai-tap-245461.html\n",
|
| 3009 |
+
"FAIL!! Using snippet for https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\n",
|
| 3010 |
+
"==> Finish for claim 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'. Total: 1853 chunks.\n"
|
| 3011 |
]
|
| 3012 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3013 |
]
|
| 3014 |
},
|
| 3015 |
{
|
| 3016 |
"cell_type": "code",
|
| 3017 |
+
"execution_count": null,
|
| 3018 |
"id": "-CcEWKwlpLII",
|
| 3019 |
"metadata": {
|
| 3020 |
"id": "-CcEWKwlpLII"
|
|
|
|
| 3026 |
},
|
| 3027 |
{
|
| 3028 |
"cell_type": "code",
|
| 3029 |
+
"execution_count": null,
|
| 3030 |
"id": "F2Wl6CytHxXu",
|
| 3031 |
"metadata": {
|
| 3032 |
"colab": {
|
|
|
|
| 3442 |
},
|
| 3443 |
{
|
| 3444 |
"cell_type": "code",
|
| 3445 |
+
"execution_count": null,
|
| 3446 |
"id": "9J1Z1TzdOBfX",
|
| 3447 |
"metadata": {
|
| 3448 |
"id": "9J1Z1TzdOBfX"
|
|
|
|
| 3454 |
},
|
| 3455 |
{
|
| 3456 |
"cell_type": "code",
|
| 3457 |
+
"execution_count": null,
|
| 3458 |
"id": "qYo7yMI9H1Uc",
|
| 3459 |
"metadata": {
|
| 3460 |
"colab": {
|
|
|
|
| 3729 |
},
|
| 3730 |
{
|
| 3731 |
"cell_type": "code",
|
| 3732 |
+
"execution_count": null,
|
| 3733 |
"id": "AHMdGO0JOECE",
|
| 3734 |
"metadata": {
|
| 3735 |
"id": "AHMdGO0JOECE"
|
|
|
|
| 3742 |
},
|
| 3743 |
{
|
| 3744 |
"cell_type": "code",
|
| 3745 |
+
"execution_count": null,
|
| 3746 |
"id": "B3CSkIO6FqEz",
|
| 3747 |
"metadata": {
|
| 3748 |
"colab": {
|
|
|
|
| 3940 |
},
|
| 3941 |
{
|
| 3942 |
"cell_type": "code",
|
| 3943 |
+
"execution_count": null,
|
| 3944 |
"id": "kBYXeNpdIRdt",
|
| 3945 |
"metadata": {
|
| 3946 |
"colab": {
|
|
|
|
| 12253 |
},
|
| 12254 |
"nbformat": 4,
|
| 12255 |
"nbformat_minor": 5
|
| 12256 |
+
}
|