Ming3993 committed on
Commit
117f8bb
·
1 Parent(s): 9ed1853

Module 4: Refine Crawler's logic

Browse files

- Use a single session for the whole crawling process instead of opening many
separate requests, to save time.
- Use multithreading to crawl documents concurrently.
- Use pdfminer.six for faster PDF crawling (this tool does not preserve the
PDF formatting, but keyword extraction is sufficient for this use case).
- Add the Playwright stealth method to avoid being blocked by Cloudflare.

Files changed (1) hide show
  1. Baseline.ipynb +522 -283
Baseline.ipynb CHANGED
@@ -1154,7 +1154,7 @@
1154
  },
1155
  {
1156
  "cell_type": "code",
1157
- "execution_count": 1,
1158
  "id": "4437641d",
1159
  "metadata": {
1160
  "colab": {
@@ -1261,7 +1261,7 @@
1261
  },
1262
  {
1263
  "cell_type": "code",
1264
- "execution_count": 2,
1265
  "id": "0jkUpzEPhFLT",
1266
  "metadata": {
1267
  "colab": {
@@ -1347,7 +1347,9 @@
1347
  "cell_type": "code",
1348
  "execution_count": null,
1349
  "id": "18c52c1a",
1350
- "metadata": {},
 
 
1351
  "outputs": [],
1352
  "source": [
1353
  "import os\n",
@@ -1499,7 +1501,7 @@
1499
  " domain_bonus += 0.25\n",
1500
  " elif any(d in domain for d in BAD_DOMAINS):\n",
1501
  " penalty += 0.3\n",
1502
- " \n",
1503
  " # Language bonus (phát hiện tiếng Việt)\n",
1504
  " vietnamese_chars = re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', snippet)\n",
1505
  " lang_bonus = 0.1 if len(vietnamese_chars) > 5 else -0.1 # trừ nếu snippet không phải tiếng Việt\n",
@@ -1570,7 +1572,10 @@
1570
  "cell_type": "code",
1571
  "execution_count": null,
1572
  "id": "771734e4",
1573
- "metadata": {},
 
 
 
1574
  "outputs": [
1575
  {
1576
  "name": "stdout",
@@ -1615,34 +1620,62 @@
1615
  },
1616
  {
1617
  "cell_type": "code",
1618
- "execution_count": 16,
1619
- "id": "wVdx5j24HKcp",
 
 
 
 
 
 
1620
  "metadata": {
1621
  "colab": {
1622
  "base_uri": "https://localhost:8080/"
1623
  },
1624
- "id": "wVdx5j24HKcp",
1625
- "outputId": "0e94e218-63a3-4d08-e30d-3f4d2e2f25a8"
1626
  },
 
 
1627
  "outputs": [
1628
  {
1629
- "name": "stdout",
1630
  "output_type": "stream",
 
1631
  "text": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1632
  "Downloading Chromium 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip\u001b[22m\n",
1633
- "\u001b[1G173.7 MiB [] 0% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 0% 2.9s\u001b[0K\u001b[1G173.7 MiB [] 1% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 2% 4.6s\u001b[0K\u001b[1G173.7 MiB [] 3% 3.6s\u001b[0K\u001b[1G173.7 MiB [] 4% 3.1s\u001b[0K\u001b[1G173.7 MiB [] 5% 3.0s\u001b[0K\u001b[1G173.7 MiB [] 6% 2.7s\u001b[0K\u001b[1G173.7 MiB [] 6% 2.6s\u001b[0K\u001b[1G173.7 MiB [] 8% 2.3s\u001b[0K\u001b[1G173.7 MiB [] 9% 2.2s\u001b[0K\u001b[1G173.7 MiB [] 10% 2.1s\u001b[0K\u001b[1G173.7 MiB [] 12% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 13% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 14% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 15% 1.8s\u001b[0K\u001b[1G173.7 MiB [] 17% 1.7s\u001b[0K\u001b[1G173.7 MiB [] 18% 1.6s\u001b[0K\u001b[1G173.7 MiB [] 20% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 21% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 21% 1.6s\u001b[0K\u001b[1G173.7 MiB [] 21% 1.7s\u001b[0K\u001b[1G173.7 MiB [] 21% 1.8s\u001b[0K\u001b[1G173.7 MiB [] 22% 1.8s\u001b[0K\u001b[1G173.7 MiB [] 22% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 22% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 22% 2.1s\u001b[0K\u001b[1G173.7 MiB [] 23% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 24% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 25% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 26% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 28% 1.8s\u001b[0K\u001b[1G173.7 MiB [] 29% 1.8s\u001b[0K\u001b[1G173.7 MiB [] 31% 1.6s\u001b[0K\u001b[1G173.7 MiB [] 33% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 35% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 36% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 38% 1.3s\u001b[0K\u001b[1G173.7 MiB [] 39% 1.3s\u001b[0K\u001b[1G173.7 MiB [] 41% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 42% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 43% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 44% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 45% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 47% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 49% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 50% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 52% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 53% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 55% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 56% 0.8s\u001b[0K\u001b[1G173.7 MiB 
[] 58% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 59% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 61% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 63% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 64% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 65% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 66% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 68% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 70% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 72% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 74% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 75% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 77% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 79% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 80% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 81% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 82% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 83% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 85% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 87% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 88% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 89% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 90% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 92% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 93% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 95% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 97% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 99% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 100% 0.0s\u001b[0K\n",
1634
  "Chromium 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium-1187\n",
1635
  "Downloading Chromium Headless Shell 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-headless-shell-linux.zip\u001b[22m\n",
1636
- "\u001b[1G104.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 0% 2.3s\u001b[0K\u001b[1G104.3 MiB [] 2% 1.5s\u001b[0K\u001b[1G104.3 MiB [] 3% 1.4s\u001b[0K\u001b[1G104.3 MiB [] 5% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 6% 1.1s\u001b[0K\u001b[1G104.3 MiB [] 8% 1.1s\u001b[0K\u001b[1G104.3 MiB [] 8% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 10% 1.1s\u001b[0K\u001b[1G104.3 MiB [] 11% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 12% 1.1s\u001b[0K\u001b[1G104.3 MiB [] 14% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 16% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 18% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 20% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 21% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 23% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 24% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 26% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 28% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 30% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 33% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 34% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 37% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 39% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 42% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 43% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 44% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 46% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 49% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 52% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 55% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 58% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 60% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 63% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 66% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 69% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 71% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 74% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 76% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 79% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 82% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 85% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 88% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 91% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 93% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 96% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 99% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 100% 0.0s\u001b[0K\n",
1637
  "Chromium Headless Shell 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium_headless_shell-1187\n",
1638
  "Downloading Firefox 141.0 (playwright build v1490)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/firefox/1490/firefox-ubuntu-22.04.zip\u001b[22m\n",
1639
- "\u001b[1G96 MiB [] 0% 0.0s\u001b[0K\u001b[1G96 MiB [] 0% 2.4s\u001b[0K\u001b[1G96 MiB [] 1% 1.6s\u001b[0K\u001b[1G96 MiB [] 3% 1.5s\u001b[0K\u001b[1G96 MiB [] 4% 1.3s\u001b[0K\u001b[1G96 MiB [] 6% 1.2s\u001b[0K\u001b[1G96 MiB [] 7% 1.2s\u001b[0K\u001b[1G96 MiB [] 9% 1.1s\u001b[0K\u001b[1G96 MiB [] 9% 1.3s\u001b[0K\u001b[1G96 MiB [] 10% 1.3s\u001b[0K\u001b[1G96 MiB [] 11% 1.2s\u001b[0K\u001b[1G96 MiB [] 12% 1.3s\u001b[0K\u001b[1G96 MiB [] 14% 1.2s\u001b[0K\u001b[1G96 MiB [] 15% 1.1s\u001b[0K\u001b[1G96 MiB [] 16% 1.2s\u001b[0K\u001b[1G96 MiB [] 17% 1.1s\u001b[0K\u001b[1G96 MiB [] 19% 1.1s\u001b[0K\u001b[1G96 MiB [] 19% 1.2s\u001b[0K\u001b[1G96 MiB [] 20% 1.4s\u001b[0K\u001b[1G96 MiB [] 20% 1.5s\u001b[0K\u001b[1G96 MiB [] 20% 1.6s\u001b[0K\u001b[1G96 MiB [] 20% 1.7s\u001b[0K\u001b[1G96 MiB [] 20% 1.8s\u001b[0K\u001b[1G96 MiB [] 20% 1.9s\u001b[0K\u001b[1G96 MiB [] 20% 2.0s\u001b[0K\u001b[1G96 MiB [] 22% 1.9s\u001b[0K\u001b[1G96 MiB [] 23% 1.8s\u001b[0K\u001b[1G96 MiB [] 25% 1.8s\u001b[0K\u001b[1G96 MiB [] 26% 1.7s\u001b[0K\u001b[1G96 MiB [] 27% 1.6s\u001b[0K\u001b[1G96 MiB [] 29% 1.5s\u001b[0K\u001b[1G96 MiB [] 30% 1.5s\u001b[0K\u001b[1G96 MiB [] 32% 1.4s\u001b[0K\u001b[1G96 MiB [] 34% 1.3s\u001b[0K\u001b[1G96 MiB [] 36% 1.3s\u001b[0K\u001b[1G96 MiB [] 37% 1.2s\u001b[0K\u001b[1G96 MiB [] 38% 1.2s\u001b[0K\u001b[1G96 MiB [] 39% 1.1s\u001b[0K\u001b[1G96 MiB [] 41% 1.1s\u001b[0K\u001b[1G96 MiB [] 43% 1.0s\u001b[0K\u001b[1G96 MiB [] 44% 1.0s\u001b[0K\u001b[1G96 MiB [] 45% 1.0s\u001b[0K\u001b[1G96 MiB [] 46% 1.0s\u001b[0K\u001b[1G96 MiB [] 47% 1.0s\u001b[0K\u001b[1G96 MiB [] 50% 0.9s\u001b[0K\u001b[1G96 MiB [] 51% 0.9s\u001b[0K\u001b[1G96 MiB [] 54% 0.8s\u001b[0K\u001b[1G96 MiB [] 56% 0.8s\u001b[0K\u001b[1G96 MiB [] 58% 0.7s\u001b[0K\u001b[1G96 MiB [] 60% 0.7s\u001b[0K\u001b[1G96 MiB [] 61% 0.6s\u001b[0K\u001b[1G96 MiB [] 62% 0.6s\u001b[0K\u001b[1G96 MiB [] 64% 0.6s\u001b[0K\u001b[1G96 MiB [] 65% 0.6s\u001b[0K\u001b[1G96 MiB [] 66% 0.6s\u001b[0K\u001b[1G96 MiB [] 68% 
0.5s\u001b[0K\u001b[1G96 MiB [] 70% 0.5s\u001b[0K\u001b[1G96 MiB [] 73% 0.4s\u001b[0K\u001b[1G96 MiB [] 75% 0.4s\u001b[0K\u001b[1G96 MiB [] 76% 0.4s\u001b[0K\u001b[1G96 MiB [] 78% 0.3s\u001b[0K\u001b[1G96 MiB [] 80% 0.3s\u001b[0K\u001b[1G96 MiB [] 83% 0.3s\u001b[0K\u001b[1G96 MiB [] 85% 0.2s\u001b[0K\u001b[1G96 MiB [] 87% 0.2s\u001b[0K\u001b[1G96 MiB [] 89% 0.2s\u001b[0K\u001b[1G96 MiB [] 91% 0.1s\u001b[0K\u001b[1G96 MiB [] 92% 0.1s\u001b[0K\u001b[1G96 MiB [] 94% 0.1s\u001b[0K\u001b[1G96 MiB [] 96% 0.0s\u001b[0K\u001b[1G96 MiB [] 98% 0.0s\u001b[0K\u001b[1G96 MiB [] 100% 0.0s\u001b[0K\n",
1640
  "Firefox 141.0 (playwright build v1490) downloaded to /root/.cache/ms-playwright/firefox-1490\n",
1641
  "Downloading Webkit 26.0 (playwright build v2203)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/webkit/2203/webkit-ubuntu-22.04.zip\u001b[22m\n",
1642
- "\u001b[1G94.6 MiB [] 0% 0.0s\u001b[0K\u001b[1G94.6 MiB [] 0% 1.6s\u001b[0K\u001b[1G94.6 MiB [] 2% 1.2s\u001b[0K\u001b[1G94.6 MiB [] 4% 1.0s\u001b[0K\u001b[1G94.6 MiB [] 6% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 8% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 9% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 11% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 13% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 15% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 16% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 18% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 20% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 23% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 24% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 26% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 28% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 31% 0.6s\u001b[0K\u001b[1G94.6 MiB [] 33% 0.6s\u001b[0K\u001b[1G94.6 MiB [] 36% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 38% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 41% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 43% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 46% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 47% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 48% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 50% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 53% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 56% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 59% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 62% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 65% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 67% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 70% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 73% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 75% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 77% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 79% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 81% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 83% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 84% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 85% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 85% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 86% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 87% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 87% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 88% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 92% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 92% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 94% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 95% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 98% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 99% 
0.0s\u001b[0K\u001b[1G94.6 MiB [] 100% 0.0s\u001b[0K\n",
1643
  "Webkit 26.0 (playwright build v2203) downloaded to /root/.cache/ms-playwright/webkit-2203\n",
1644
  "Downloading FFMPEG playwright build v1011\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/ffmpeg/1011/ffmpeg-linux.zip\u001b[22m\n",
1645
- "\u001b[1G2.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 48% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 100% 0.0s\u001b[0K\n",
1646
  "FFMPEG playwright build v1011 downloaded to /root/.cache/ms-playwright/ffmpeg-1011\n",
1647
  "Playwright Host validation warning: \n",
1648
  "╔══════════════════════════════════════════════════════╗\n",
@@ -1659,33 +1692,30 @@
1659
  "║ libmanette-0.2.so.0 ║\n",
1660
  "╚══════════════════════════════════════════════════════╝\n",
1661
  " at validateDependenciesLinux (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:269:9)\n",
1662
- "\u001b[90m at process.processTicksAndRejections (node:internal/process/task_queues:105:5)\u001b[39m\n",
1663
  " at async Registry._validateHostRequirements (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:934:14)\n",
1664
  " at async Registry._validateHostRequirementsForExecutableIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1056:7)\n",
1665
  " at async Registry.validateHostRequirementsForExecutablesIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1045:7)\n",
1666
  " at async i.<anonymous> (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/cli/program.js:217:7)\n",
1667
  "Installing dependencies...\n",
1668
  "Hit:1 https://cli.github.com/packages stable InRelease\n",
1669
- "Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n",
1670
- "Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
1671
- "Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
1672
- "Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
1673
- "Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n",
1674
- "Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n",
1675
- "Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]\n",
1676
- "Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n",
1677
- "Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]\n",
1678
- "Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,811 kB]\n",
1679
- "Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,750 kB]\n",
1680
- "Hit:13 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
1681
- "Get:14 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [32.8 kB]\n",
1682
- "Get:15 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [5,727 kB]\n",
1683
- "Get:16 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy/main amd64 Packages [44.9 kB]\n",
1684
- "Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,582 kB]\n",
1685
- "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,348 kB]\n",
1686
- "Get:19 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,425 kB]\n",
1687
- "Get:20 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,276 kB]\n",
1688
- "Fetched 28.4 MB in 5s (6,112 kB/s)\n",
1689
  "Reading package lists... Done\n",
1690
  "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
1691
  "Reading package lists... Done\n",
@@ -1844,7 +1874,7 @@
1844
  " libwildmidi2 libwoff1 libxtst6 libyuv0 libzbar0 libzxingcore1\n",
1845
  " session-migration timgm6mb-soundfont xfonts-cyrillic xfonts-encodings\n",
1846
  " xfonts-scalable xfonts-utils\n",
1847
- "0 upgraded, 94 newly installed, 0 to remove and 42 not upgraded.\n",
1848
  "Need to get 48.2 MB of archives.\n",
1849
  "After this operation, 123 MB of additional disk space will be used.\n",
1850
  "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-ipafont-gothic all 00303-21ubuntu1 [3,513 kB]\n",
@@ -1941,7 +1971,7 @@
1941
  "Get:92 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-aacenc0 amd64 0.1.3-2 [69.4 kB]\n",
1942
  "Get:93 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-amrwbenc0 amd64 0.1.3-2 [68.2 kB]\n",
1943
  "Get:94 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 gstreamer1.0-plugins-bad amd64 1.20.3-0ubuntu1.1 [2,602 kB]\n",
1944
- "Fetched 48.2 MB in 7s (6,490 kB/s)\n",
1945
  "Extracting templates from packages: 100%\n",
1946
  "Preconfiguring packages ...\n",
1947
  "Selecting previously unselected package fonts-ipafont-gothic.\n",
@@ -2321,33 +2351,33 @@
2321
  "Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...\n",
2322
  "Processing triggers for libglib2.0-0:amd64 (2.72.4-0ubuntu2.6) ...\n",
2323
  "Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
2324
- "/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is not a symbolic link\n",
2325
  "\n",
2326
- "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
2327
  "\n",
2328
- "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link\n",
2329
  "\n",
2330
- "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link\n",
2331
  "\n",
2332
  "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
2333
  "\n",
2334
- "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero_v2.so.0 is not a symbolic link\n",
2335
  "\n",
2336
  "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
2337
  "\n",
2338
- "/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link\n",
2339
  "\n",
2340
- "/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link\n",
2341
  "\n",
2342
- "/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link\n",
2343
  "\n",
2344
  "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
2345
  "\n",
2346
- "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
2347
  "\n",
2348
- "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
2349
  "\n",
2350
- "/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link\n",
2351
  "\n",
2352
  "Setting up glib-networking:amd64 (2.72.0-1) ...\n",
2353
  "Setting up libsoup2.4-1:amd64 (2.74.2-3ubuntu0.6) ...\n",
@@ -2360,34 +2390,217 @@
2360
  "Setting up gstreamer1.0-plugins-bad:amd64 (1.20.3-0ubuntu1.1) ...\n",
2361
  "Processing triggers for dictionaries-common (1.28.14) ...\n",
2362
  "Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
2363
- "/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is not a symbolic link\n",
2364
  "\n",
2365
- "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
2366
  "\n",
2367
- "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link\n",
2368
  "\n",
2369
- "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link\n",
2370
  "\n",
2371
  "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
2372
  "\n",
2373
- "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero_v2.so.0 is not a symbolic link\n",
2374
  "\n",
2375
  "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
2376
  "\n",
2377
- "/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link\n",
2378
  "\n",
2379
- "/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link\n",
2380
  "\n",
2381
- "/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link\n",
2382
  "\n",
2383
  "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
2384
  "\n",
 
 
2385
  "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
2386
  "\n",
2387
- "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
2388
  "\n",
2389
- "/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link\n",
2390
- "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2391
  ]
2392
  }
2393
  ],
@@ -2398,7 +2611,7 @@
2398
  },
2399
  {
2400
  "cell_type": "code",
2401
- "execution_count": 17,
2402
  "id": "uNgeNVFoMErV",
2403
  "metadata": {
2404
  "id": "uNgeNVFoMErV"
@@ -2406,48 +2619,46 @@
2406
  "outputs": [],
2407
  "source": [
2408
  "import requests\n",
2409
- "import pdfplumber\n",
2410
- "import io"
 
 
2411
  ]
2412
  },
2413
  {
2414
  "cell_type": "code",
2415
- "execution_count": 18,
2416
  "id": "tvAnpg8zMA08",
2417
  "metadata": {
2418
  "id": "tvAnpg8zMA08"
2419
  },
2420
  "outputs": [],
2421
  "source": [
2422
- "async def extract_text_from_pdf(url: str) -> str | None:\n",
2423
  " \"\"\"\n",
2424
- " Tải file PDF từ URL và trích xuất nội dung văn bản.\n",
2425
  " \"\"\"\n",
2426
- " print(f\" -> Detect PDF link. Handle by pdfplumber: {url}\")\n",
2427
  " try:\n",
2428
- " # Tải nội dung file PDF vào bộ nhớ\n",
2429
- " response = requests.get(url, timeout=60)\n",
2430
- " response.raise_for_status() # Báo lỗi nếu tải không thành công\n",
 
2431
  "\n",
2432
- " # Mở file PDF từ dữ liệu bytes trong bộ nhớ\n",
2433
- " with pdfplumber.open(io.BytesIO(response.content)) as pdf:\n",
2434
- " full_text = []\n",
2435
- " for page in pdf.pages:\n",
2436
- " text = page.extract_text()\n",
2437
- " if text:\n",
2438
- " full_text.append(text)\n",
2439
  "\n",
2440
- " print(f\"SUCCESS! Context extract from Playwright ---\")\n",
2441
- " return \"\\n\".join(full_text)\n",
 
2442
  "\n",
2443
  " except Exception as e:\n",
2444
- " print(f\"Error when open file PDF {url}: {e}\")\n",
2445
  " return None"
2446
  ]
2447
  },
2448
  {
2449
  "cell_type": "code",
2450
- "execution_count": 19,
2451
  "id": "xSKWwAbIBwTu",
2452
  "metadata": {
2453
  "id": "xSKWwAbIBwTu"
@@ -2455,71 +2666,97 @@
2455
  "outputs": [],
2456
  "source": [
2457
  "from playwright.async_api import async_playwright\n",
 
2458
  "import trafilatura"
2459
  ]
2460
  },
2461
  {
2462
  "cell_type": "code",
2463
- "execution_count": 20,
2464
  "id": "6QF-79pKSBw1",
2465
  "metadata": {
2466
  "id": "6QF-79pKSBw1"
2467
  },
2468
  "outputs": [],
2469
  "source": [
2470
- "async def extract_text_from_web(url: str) -> str | None:\n",
2471
  " \"\"\"\n",
2472
- " Sử dụng phiên bản Async của Playwright để điều khiển trình duyệt,\n",
2473
- " lấy HTML, sau đó dùng trafilatura để trích xuất nội dung.\n",
2474
  " \"\"\"\n",
 
2475
  "\n",
2476
- " print(f\" -> Detect web link. Handle by trafilatura: {url}\")\n",
2477
  " try:\n",
2478
- " async with async_playwright() as p:\n",
2479
- " browser = await p.chromium.launch(headless=True)\n",
2480
- " page = await browser.new_page()\n",
 
 
2481
  "\n",
2482
- " await page.goto(url, timeout=90000, wait_until=\"domcontentloaded\")\n",
 
 
 
2483
  "\n",
2484
- " html_content = await page.content()\n",
2485
- " await browser.close()\n",
2486
  "\n",
2487
- " if not html_content:\n",
2488
- " return None\n",
 
2489
  "\n",
2490
- " main_text = trafilatura.extract(html_content, include_comments=False)\n",
2491
- " print(f\"SUCCESS! Context extract from Playwright ---\")\n",
2492
- " return main_text\n",
 
2493
  "\n",
2494
- " except Exception as e:\n",
2495
- " print(f\"Error for using trafilatura for URL {url}: {e}\")\n",
2496
- " return None"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2497
  ]
2498
  },
2499
  {
2500
  "cell_type": "code",
2501
- "execution_count": 21,
2502
  "id": "rL1vDTvHMwAj",
2503
  "metadata": {
2504
  "id": "rL1vDTvHMwAj"
2505
  },
2506
  "outputs": [],
2507
  "source": [
2508
- "async def fetch_content_from_url(url: str) -> str | None:\n",
2509
  " \"\"\"\n",
2510
  " Hàm điều phối: Kiểm tra loại URL và gọi hàm xử lý tương ứng.\n",
2511
  " \"\"\"\n",
2512
- " # Chuyển URL về chữ thường để kiểm tra đuôi file\n",
2513
  " if url.lower().endswith('.pdf'):\n",
2514
- " return await extract_text_from_pdf(url)\n",
2515
  " else:\n",
2516
- " # Sử dụng lại hàm đã thành công cho HTML\n",
2517
- " return await extract_text_from_web(url)"
2518
  ]
2519
  },
2520
  {
2521
  "cell_type": "code",
2522
- "execution_count": 22,
2523
  "id": "w3y0tq_pLIXu",
2524
  "metadata": {
2525
  "id": "w3y0tq_pLIXu"
@@ -2556,13 +2793,16 @@
2556
  },
2557
  {
2558
  "cell_type": "code",
2559
- "execution_count": 23,
2560
  "id": "0eJs0RfoBz5o",
2561
  "metadata": {
2562
  "id": "0eJs0RfoBz5o"
2563
  },
2564
  "outputs": [],
2565
  "source": [
 
 
 
2566
  "def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50) -> list[str]:\n",
2567
  " \"\"\"Hàm tiện ích để chia văn bản dài thành các chunk nhỏ hơn.\"\"\"\n",
2568
  " text_splitter = RecursiveCharacterTextSplitter(\n",
@@ -2575,207 +2815,206 @@
2575
  },
2576
  {
2577
  "cell_type": "code",
2578
- "execution_count": 24,
2579
- "id": "01a1ffM5ByR3",
2580
  "metadata": {
2581
- "id": "01a1ffM5ByR3"
2582
  },
2583
  "outputs": [],
2584
  "source": [
2585
- "import json\n",
2586
- "from langchain.text_splitter import RecursiveCharacterTextSplitter"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2587
  ]
2588
  },
2589
  {
2590
  "cell_type": "code",
2591
- "execution_count": 25,
2592
- "id": "BsLJSWmbHrPf",
 
 
 
 
 
 
 
 
 
2593
  "metadata": {
2594
  "colab": {
2595
  "base_uri": "https://localhost:8080/"
2596
  },
2597
- "id": "BsLJSWmbHrPf",
2598
- "outputId": "2abfbfcf-9678-4d45-89a2-d5d338204df3"
2599
  },
 
 
2600
  "outputs": [
2601
  {
2602
- "name": "stdout",
2603
  "output_type": "stream",
 
2604
  "text": [
2605
  "\n",
2606
  "==================================================\n",
2607
  "Handle claim: 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '\n",
2608
- "\n",
2609
- " -> Crawling: https://vnexpress.net/bien-doi-khi-hau-cham-ngoi-cho-thoi-tiet-cuc-doan-the-nao-4739038.html\n",
2610
- " -> Detect web link. Handle by trafilatura: https://vnexpress.net/bien-doi-khi-hau-cham-ngoi-cho-thoi-tiet-cuc-doan-the-nao-4739038.html\n",
2611
- "SUCCESS! Context extract from Playwright ---\n",
2612
- "SUCCESS!! Using full content\n",
2613
- "\n",
2614
- " -> Crawling: https://special.nhandan.vn/biendoikhihauvahanhdongcuavietnam/index.html\n",
2615
- " -> Detect web link. Handle by trafilatura: https://special.nhandan.vn/biendoikhihauvahanhdongcuavietnam/index.html\n",
2616
- "SUCCESS! Context extract from Playwright ---\n",
2617
- "SUCCESS!! Using full content\n",
2618
- "\n",
2619
- " -> Crawling: https://nhandan.vn/thich-ung-bien-doi-khi-hau-thuan-thien-ben-vung-post909799.html\n",
2620
- " -> Detect web link. Handle by trafilatura: https://nhandan.vn/thich-ung-bien-doi-khi-hau-thuan-thien-ben-vung-post909799.html\n",
2621
- "SUCCESS! Context extract from Playwright ---\n",
2622
- "SUCCESS!! Using full content\n",
2623
- "\n",
2624
- " -> Crawling: https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false\n",
2625
- " -> Detect web link. Handle by trafilatura: https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false\n",
2626
- "SUCCESS! Context extract from Playwright ---\n",
2627
- "SUCCESS!! Using full content\n",
2628
- "\n",
2629
- " -> Crawling: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2630
- " -> Detect web link. Handle by trafilatura: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2631
- "SUCCESS! Context extract from Playwright ---\n",
2632
- "SUCCESS!! Using full content\n",
2633
- "\n",
2634
- " -> Crawling: https://vnexpress.net/bien-doi-khi-hau-4796505.html\n",
2635
- " -> Detect web link. Handle by trafilatura: https://vnexpress.net/bien-doi-khi-hau-4796505.html\n",
2636
- "SUCCESS! Context extract from Playwright ---\n",
2637
- "SUCCESS!! Using full content\n",
2638
- "\n",
2639
- " -> Crawling: https://vnexpress.net/bien-doi-khi-hau-khien-la-nina-co-yeu-to-di-thuong-4791345.html\n",
2640
- " -> Detect web link. Handle by trafilatura: https://vnexpress.net/bien-doi-khi-hau-khien-la-nina-co-yeu-to-di-thuong-4791345.html\n",
2641
- "SUCCESS! Context extract from Playwright ---\n",
2642
- "SUCCESS!! Using full content\n",
2643
- "\n",
2644
- " -> Crawling: https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe\n",
2645
- " -> Detect web link. Handle by trafilatura: https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe\n",
2646
- "SUCCESS! Context extract from Playwright ---\n",
2647
- "SUCCESS!! Using full content\n",
2648
- "\n",
2649
- " -> Crawling: https://nhandan.vn/hoi-chuong-bao-dong-ve-muc-do-nghiem-trong-cua-cuoc-khung-hoang-khi-hau-post893263.html\n",
2650
- " -> Detect web link. Handle by trafilatura: https://nhandan.vn/hoi-chuong-bao-dong-ve-muc-do-nghiem-trong-cua-cuoc-khung-hoang-khi-hau-post893263.html\n",
2651
- "SUCCESS! Context extract from Playwright ---\n",
2652
- "SUCCESS!! Using full content\n",
2653
- "\n",
2654
- " -> Crawling: https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau\n",
2655
- " -> Detect web link. Handle by trafilatura: https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau\n",
2656
- "SUCCESS! Context extract from Playwright ---\n",
2657
- "SUCCESS!! Using full content\n",
2658
- "\n",
2659
- " -> Crawling: https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem\n",
2660
- " -> Detect web link. Handle by trafilatura: https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem\n",
2661
- "SUCCESS! Context extract from Playwright ---\n",
2662
- "SUCCESS!! Using full content\n",
2663
- "\n",
2664
- " -> Crawling: https://nhandan.vn/thoi-tiet-cuc-doan-anh-huong-nghiem-trong-den-nen-kinh-te-anh-post712261.html\n",
2665
- " -> Detect web link. Handle by trafilatura: https://nhandan.vn/thoi-tiet-cuc-doan-anh-huong-nghiem-trong-den-nen-kinh-te-anh-post712261.html\n",
2666
- "SUCCESS! Context extract from Playwright ---\n",
2667
- "SUCCESS!! Using full content\n",
2668
- "==> Finish for claim 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '. Total: 149 chunks.\n",
2669
  "\n",
2670
  "==================================================\n",
2671
  "Handle claim: 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'\n",
2672
- "\n",
2673
- " -> Crawling: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2674
- " -> Detect web link. Handle by trafilatura: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2675
- "SUCCESS! Context extract from Playwright ---\n",
2676
- "SUCCESS!! Using full content\n",
2677
- "\n",
2678
- " -> Crawling: https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/\n",
2679
- " -> Detect web link. Handle by trafilatura: https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/\n",
2680
- "SUCCESS! Context extract from Playwright ---\n",
2681
- "SUCCESS!! Using full content\n",
2682
- "\n",
2683
- " -> Crawling: https://documents1.worldbank.org/curated/en/099051625143037334/pdf/P176996-1f81a83a-aa8f-49d2-84ea-ec7d286593c5.pdf\n",
2684
- " -> Detect PDF link. Handle by pdfplumber: https://documents1.worldbank.org/curated/en/099051625143037334/pdf/P176996-1f81a83a-aa8f-49d2-84ea-ec7d286593c5.pdf\n",
2685
- "SUCCESS! Context extract from Playwright ---\n",
2686
- "SUCCESS!! Using full content\n",
2687
- "\n",
2688
- " -> Crawling: https://documents1.worldbank.org/curated/en/099152108232435513/pdf/IDU-00472f84-1adf-466c-9688-9d150a0879da.pdf\n",
2689
- " -> Detect PDF link. Handle by pdfplumber: https://documents1.worldbank.org/curated/en/099152108232435513/pdf/IDU-00472f84-1adf-466c-9688-9d150a0879da.pdf\n",
2690
- "SUCCESS! Context extract from Playwright ---\n",
2691
- "SUCCESS!! Using full content\n",
2692
- "\n",
2693
- " -> Crawling: https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
2694
- " -> Detect web link. Handle by trafilatura: https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
2695
- "SUCCESS! Context extract from Playwright ---\n",
2696
- "SUCCESS!! Using full content\n",
2697
- "\n",
2698
- " -> Crawling: https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
2699
- " -> Detect PDF link. Handle by pdfplumber: https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
2700
- "SUCCESS! Context extract from Playwright ---\n",
2701
- "SUCCESS!! Using full content\n",
2702
- "\n",
2703
- " -> Crawling: https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
2704
- " -> Detect PDF link. Handle by pdfplumber: https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
2705
- "SUCCESS! Context extract from Playwright ---\n",
2706
- "SUCCESS!! Using full content\n",
2707
- "\n",
2708
- " -> Crawling: http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
2709
- " -> Detect web link. Handle by trafilatura: http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
2710
- "Error for using trafilatura for URL http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html: Page.goto: Timeout 90000ms exceeded.\n",
2711
  "Call log:\n",
2712
- " - navigating to \"http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\", waiting until \"domcontentloaded\"\n",
2713
- "\n",
2714
- "FAIL!! Using snippet as backup\n",
2715
- "\n",
2716
- " -> Crawling: https://loigiaihay.com/bai-tap-245461.html\n",
2717
- " -> Detect web link. Handle by trafilatura: https://loigiaihay.com/bai-tap-245461.html\n",
2718
- "SUCCESS! Context extract from Playwright ---\n",
2719
- "SUCCESS!! Using full content\n",
2720
  "\n",
2721
- " -> Crawling: https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\n",
2722
- " -> Detect web link. Handle by trafilatura: https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\n",
2723
- "Error for using trafilatura for URL https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75: Page.goto: Download is starting\n",
 
 
 
 
2724
  "Call log:\n",
2725
- " - navigating to \"https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\", waiting until \"domcontentloaded\"\n",
2726
  "\n",
2727
- "FAIL!! Using snippet as backup\n",
2728
- "==> Finish for claim 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'. Total: 1844 chunks.\n"
 
 
 
 
 
 
 
 
 
2729
  ]
2730
  }
2731
- ],
2732
- "source": [
2733
- "# Tải dữ liệu từ file\n",
2734
- "with open('document_retrieval_results.json', 'r', encoding='utf-8') as f:\n",
2735
- " retrieved_data = json.load(f)\n",
2736
- "\n",
2737
- "# Tổ chức lại evidence theo từng claim\n",
2738
- "evidence_by_claim = {}\n",
2739
- "claims = list(retrieved_data.keys())\n",
2740
- "\n",
2741
- "for claim in claims:\n",
2742
- " print(f\"\\n{'='*50}\\nHandle claim: '{claim}'\")\n",
2743
- " documents = retrieved_data[claim]\n",
2744
- " all_chunks_for_this_claim = []\n",
2745
- "\n",
2746
- " for doc in documents:\n",
2747
- " # 1. Thử crawl để lấy nội dung đầy đủ\n",
2748
- " print(f\"\\n -> Crawling: {doc['link']}\")\n",
2749
- " full_content = await fetch_content_from_url(doc['link'])\n",
2750
- "\n",
2751
- " # 2. Chiến lược Fallback: Nếu crawl lỗi, dùng tạm snippet\n",
2752
- " content_to_process = \"\"\n",
2753
- " if full_content and len(full_content) > 100: # Kiểm tra nội dung có đáng kể không\n",
2754
- " print(\"SUCCESS!! Using full content\")\n",
2755
- " cleaned_full_content = clean_text(full_content)\n",
2756
- " content_to_process = f\"{doc.get('title', '')}. {cleaned_full_content}\"\n",
2757
- " else:\n",
2758
- " print(\"FAIL!! Using snippet as backup\")\n",
2759
- " cleaned_snippet = clean_text(doc.get('snippet', ''))\n",
2760
- " content_to_process = f\"{doc.get('title', '')}. {cleaned_snippet}\"\n",
2761
- "\n",
2762
- " # 3. Bắt buộc: Chia nhỏ nội dung thành các chunks\n",
2763
- " chunks = chunk_text(content_to_process)\n",
2764
- "\n",
2765
- " # 4. Lưu các chunks với cấu trúc dữ liệu nhất quán\n",
2766
- " for chunk_text_part in chunks:\n",
2767
- " all_chunks_for_this_claim.append({\n",
2768
- " \"text\": chunk_text_part,\n",
2769
- " \"link\": doc['link'] # Giữ lại nguồn của chunk\n",
2770
- " })\n",
2771
- "\n",
2772
- " evidence_by_claim[claim] = all_chunks_for_this_claim\n",
2773
- " print(f\"==> Finish for claim '{claim}'. Total: {len(all_chunks_for_this_claim)} chunks.\")"
2774
  ]
2775
  },
2776
  {
2777
  "cell_type": "code",
2778
- "execution_count": 29,
2779
  "id": "-CcEWKwlpLII",
2780
  "metadata": {
2781
  "id": "-CcEWKwlpLII"
@@ -2787,7 +3026,7 @@
2787
  },
2788
  {
2789
  "cell_type": "code",
2790
- "execution_count": 26,
2791
  "id": "F2Wl6CytHxXu",
2792
  "metadata": {
2793
  "colab": {
@@ -3203,7 +3442,7 @@
3203
  },
3204
  {
3205
  "cell_type": "code",
3206
- "execution_count": 27,
3207
  "id": "9J1Z1TzdOBfX",
3208
  "metadata": {
3209
  "id": "9J1Z1TzdOBfX"
@@ -3215,7 +3454,7 @@
3215
  },
3216
  {
3217
  "cell_type": "code",
3218
- "execution_count": 28,
3219
  "id": "qYo7yMI9H1Uc",
3220
  "metadata": {
3221
  "colab": {
@@ -3490,7 +3729,7 @@
3490
  },
3491
  {
3492
  "cell_type": "code",
3493
- "execution_count": 30,
3494
  "id": "AHMdGO0JOECE",
3495
  "metadata": {
3496
  "id": "AHMdGO0JOECE"
@@ -3503,7 +3742,7 @@
3503
  },
3504
  {
3505
  "cell_type": "code",
3506
- "execution_count": 31,
3507
  "id": "B3CSkIO6FqEz",
3508
  "metadata": {
3509
  "colab": {
@@ -3701,7 +3940,7 @@
3701
  },
3702
  {
3703
  "cell_type": "code",
3704
- "execution_count": 32,
3705
  "id": "kBYXeNpdIRdt",
3706
  "metadata": {
3707
  "colab": {
@@ -12014,4 +12253,4 @@
12014
  },
12015
  "nbformat": 4,
12016
  "nbformat_minor": 5
12017
- }
 
1154
  },
1155
  {
1156
  "cell_type": "code",
1157
+ "execution_count": null,
1158
  "id": "4437641d",
1159
  "metadata": {
1160
  "colab": {
 
1261
  },
1262
  {
1263
  "cell_type": "code",
1264
+ "execution_count": null,
1265
  "id": "0jkUpzEPhFLT",
1266
  "metadata": {
1267
  "colab": {
 
1347
  "cell_type": "code",
1348
  "execution_count": null,
1349
  "id": "18c52c1a",
1350
+ "metadata": {
1351
+ "id": "18c52c1a"
1352
+ },
1353
  "outputs": [],
1354
  "source": [
1355
  "import os\n",
 
1501
  " domain_bonus += 0.25\n",
1502
  " elif any(d in domain for d in BAD_DOMAINS):\n",
1503
  " penalty += 0.3\n",
1504
+ "\n",
1505
  " # Language bonus (phát hiện tiếng Việt)\n",
1506
  " vietnamese_chars = re.findall(r'[àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', snippet)\n",
1507
  " lang_bonus = 0.1 if len(vietnamese_chars) > 5 else -0.1 # trừ nếu snippet không phải tiếng Việt\n",
 
1572
  "cell_type": "code",
1573
  "execution_count": null,
1574
  "id": "771734e4",
1575
+ "metadata": {
1576
+ "id": "771734e4",
1577
+ "outputId": "92cc3e56-0a5a-4626-c90f-ed39ac6c5a66"
1578
+ },
1579
  "outputs": [
1580
  {
1581
  "name": "stdout",
 
1620
  },
1621
  {
1622
  "cell_type": "code",
1623
+ "source": [
1624
+ "!pip install playwright\n",
1625
+ "!pip install playwright-stealth\n",
1626
+ "!playwright install\n",
1627
+ "!playwright install-deps\n",
1628
+ "!pip install pdfplumber\n",
1629
+ "!pip install trafilatura"
1630
+ ],
1631
  "metadata": {
1632
  "colab": {
1633
  "base_uri": "https://localhost:8080/"
1634
  },
1635
+ "id": "IAxlvGzQULAZ",
1636
+ "outputId": "07b66062-0c78-4130-c75e-4a3132ff705d"
1637
  },
1638
+ "id": "IAxlvGzQULAZ",
1639
+ "execution_count": 2,
1640
  "outputs": [
1641
  {
 
1642
  "output_type": "stream",
1643
+ "name": "stdout",
1644
  "text": [
1645
+ "Collecting playwright\n",
1646
+ " Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)\n",
1647
+ "Collecting pyee<14,>=13 (from playwright)\n",
1648
+ " Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)\n",
1649
+ "Requirement already satisfied: greenlet<4.0.0,>=3.1.1 in /usr/local/lib/python3.12/dist-packages (from playwright) (3.2.4)\n",
1650
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/dist-packages (from pyee<14,>=13->playwright) (4.15.0)\n",
1651
+ "Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)\n",
1652
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.9/45.9 MB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
1653
+ "\u001b[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)\n",
1654
+ "Installing collected packages: pyee, playwright\n",
1655
+ "Successfully installed playwright-1.55.0 pyee-13.0.0\n",
1656
+ "Collecting playwright-stealth\n",
1657
+ " Downloading playwright_stealth-2.0.0-py3-none-any.whl.metadata (4.0 kB)\n",
1658
+ "Requirement already satisfied: playwright<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from playwright-stealth) (1.55.0)\n",
1659
+ "Requirement already satisfied: pyee<14,>=13 in /usr/local/lib/python3.12/dist-packages (from playwright<2.0.0,>=1.0.0->playwright-stealth) (13.0.0)\n",
1660
+ "Requirement already satisfied: greenlet<4.0.0,>=3.1.1 in /usr/local/lib/python3.12/dist-packages (from playwright<2.0.0,>=1.0.0->playwright-stealth) (3.2.4)\n",
1661
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/dist-packages (from pyee<14,>=13->playwright<2.0.0,>=1.0.0->playwright-stealth) (4.15.0)\n",
1662
+ "Downloading playwright_stealth-2.0.0-py3-none-any.whl (32 kB)\n",
1663
+ "Installing collected packages: playwright-stealth\n",
1664
+ "Successfully installed playwright-stealth-2.0.0\n",
1665
  "Downloading Chromium 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip\u001b[22m\n",
1666
+ "\u001b[1G173.7 MiB [] 0% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 0% 9.7s\u001b[0K\u001b[1G173.7 MiB [] 0% 5.8s\u001b[0K\u001b[1G173.7 MiB [] 0% 5.6s\u001b[0K\u001b[1G173.7 MiB [] 1% 6.2s\u001b[0K\u001b[1G173.7 MiB [] 1% 4.2s\u001b[0K\u001b[1G173.7 MiB [] 3% 3.1s\u001b[0K\u001b[1G173.7 MiB [] 4% 2.5s\u001b[0K\u001b[1G173.7 MiB [] 5% 2.4s\u001b[0K\u001b[1G173.7 MiB [] 6% 2.3s\u001b[0K\u001b[1G173.7 MiB [] 7% 2.1s\u001b[0K\u001b[1G173.7 MiB [] 8% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 9% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 10% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 11% 2.0s\u001b[0K\u001b[1G173.7 MiB [] 12% 1.9s\u001b[0K\u001b[1G173.7 MiB [] 14% 1.7s\u001b[0K\u001b[1G173.7 MiB [] 15% 1.6s\u001b[0K\u001b[1G173.7 MiB [] 17% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 18% 1.5s\u001b[0K\u001b[1G173.7 MiB [] 19% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 21% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 22% 1.3s\u001b[0K\u001b[1G173.7 MiB [] 22% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 23% 1.4s\u001b[0K\u001b[1G173.7 MiB [] 25% 1.3s\u001b[0K\u001b[1G173.7 MiB [] 26% 1.2s\u001b[0K\u001b[1G173.7 MiB [] 29% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 30% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 31% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 33% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 34% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 35% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 36% 1.1s\u001b[0K\u001b[1G173.7 MiB [] 37% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 39% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 40% 1.0s\u001b[0K\u001b[1G173.7 MiB [] 42% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 44% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 46% 0.9s\u001b[0K\u001b[1G173.7 MiB [] 48% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 49% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 51% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 52% 0.8s\u001b[0K\u001b[1G173.7 MiB [] 54% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 55% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 56% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 57% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 58% 0.7s\u001b[0K\u001b[1G173.7 MiB [] 59% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 60% 0.6s\u001b[0K\u001b[1G173.7 MiB 
[] 61% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 62% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 63% 0.6s\u001b[0K\u001b[1G173.7 MiB [] 65% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 66% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 67% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 69% 0.5s\u001b[0K\u001b[1G173.7 MiB [] 71% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 73% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 74% 0.4s\u001b[0K\u001b[1G173.7 MiB [] 76% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 78% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 80% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 81% 0.3s\u001b[0K\u001b[1G173.7 MiB [] 83% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 85% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 86% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 87% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 89% 0.2s\u001b[0K\u001b[1G173.7 MiB [] 90% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 92% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 94% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 95% 0.1s\u001b[0K\u001b[1G173.7 MiB [] 97% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 99% 0.0s\u001b[0K\u001b[1G173.7 MiB [] 100% 0.0s\u001b[0K\n",
1667
  "Chromium 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium-1187\n",
1668
  "Downloading Chromium Headless Shell 140.0.7339.16 (playwright build v1187)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-headless-shell-linux.zip\u001b[22m\n",
1669
+ "\u001b[1G104.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 0% 5.7s\u001b[0K\u001b[1G104.3 MiB [] 1% 2.1s\u001b[0K\u001b[1G104.3 MiB [] 2% 1.7s\u001b[0K\u001b[1G104.3 MiB [] 3% 1.6s\u001b[0K\u001b[1G104.3 MiB [] 5% 1.3s\u001b[0K\u001b[1G104.3 MiB [] 7% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 9% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 11% 1.2s\u001b[0K\u001b[1G104.3 MiB [] 12% 1.1s\u001b[0K\u001b[1G104.3 MiB [] 15% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 16% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 18% 1.0s\u001b[0K\u001b[1G104.3 MiB [] 19% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 22% 0.9s\u001b[0K\u001b[1G104.3 MiB [] 24% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 26% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 27% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 28% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 29% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 31% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 32% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 34% 0.8s\u001b[0K\u001b[1G104.3 MiB [] 36% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 37% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 40% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 42% 0.7s\u001b[0K\u001b[1G104.3 MiB [] 44% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 46% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 47% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 48% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 49% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 50% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 51% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 52% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 53% 0.6s\u001b[0K\u001b[1G104.3 MiB [] 54% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 56% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 58% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 60% 0.5s\u001b[0K\u001b[1G104.3 MiB [] 63% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 65% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 67% 0.4s\u001b[0K\u001b[1G104.3 MiB [] 69% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 71% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 73% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 74% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 77% 0.3s\u001b[0K\u001b[1G104.3 MiB [] 79% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 81% 0.2s\u001b[0K\u001b[1G104.3 MiB [] 84% 0.2s\u001b[0K\u001b[1G104.3 
MiB [] 86% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 88% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 90% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 91% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 93% 0.1s\u001b[0K\u001b[1G104.3 MiB [] 95% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 98% 0.0s\u001b[0K\u001b[1G104.3 MiB [] 100% 0.0s\u001b[0K\n",
1670
  "Chromium Headless Shell 140.0.7339.16 (playwright build v1187) downloaded to /root/.cache/ms-playwright/chromium_headless_shell-1187\n",
1671
  "Downloading Firefox 141.0 (playwright build v1490)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/firefox/1490/firefox-ubuntu-22.04.zip\u001b[22m\n",
1672
+ "\u001b[1G96 MiB [] 0% 0.0s\u001b[0K\u001b[1G96 MiB [] 0% 5.0s\u001b[0K\u001b[1G96 MiB [] 1% 2.4s\u001b[0K\u001b[1G96 MiB [] 2% 1.7s\u001b[0K\u001b[1G96 MiB [] 4% 1.3s\u001b[0K\u001b[1G96 MiB [] 6% 1.2s\u001b[0K\u001b[1G96 MiB [] 7% 1.1s\u001b[0K\u001b[1G96 MiB [] 9% 1.1s\u001b[0K\u001b[1G96 MiB [] 10% 1.2s\u001b[0K\u001b[1G96 MiB [] 11% 1.1s\u001b[0K\u001b[1G96 MiB [] 12% 1.1s\u001b[0K\u001b[1G96 MiB [] 14% 1.1s\u001b[0K\u001b[1G96 MiB [] 16% 1.0s\u001b[0K\u001b[1G96 MiB [] 18% 1.0s\u001b[0K\u001b[1G96 MiB [] 19% 0.9s\u001b[0K\u001b[1G96 MiB [] 21% 0.9s\u001b[0K\u001b[1G96 MiB [] 23% 0.9s\u001b[0K\u001b[1G96 MiB [] 25% 0.8s\u001b[0K\u001b[1G96 MiB [] 28% 0.7s\u001b[0K\u001b[1G96 MiB [] 30% 0.7s\u001b[0K\u001b[1G96 MiB [] 33% 0.7s\u001b[0K\u001b[1G96 MiB [] 35% 0.6s\u001b[0K\u001b[1G96 MiB [] 36% 0.6s\u001b[0K\u001b[1G96 MiB [] 36% 0.7s\u001b[0K\u001b[1G96 MiB [] 38% 0.7s\u001b[0K\u001b[1G96 MiB [] 39% 0.6s\u001b[0K\u001b[1G96 MiB [] 40% 0.6s\u001b[0K\u001b[1G96 MiB [] 43% 0.6s\u001b[0K\u001b[1G96 MiB [] 46% 0.5s\u001b[0K\u001b[1G96 MiB [] 47% 0.5s\u001b[0K\u001b[1G96 MiB [] 48% 0.6s\u001b[0K\u001b[1G96 MiB [] 50% 0.5s\u001b[0K\u001b[1G96 MiB [] 50% 0.6s\u001b[0K\u001b[1G96 MiB [] 52% 0.5s\u001b[0K\u001b[1G96 MiB [] 53% 0.5s\u001b[0K\u001b[1G96 MiB [] 55% 0.5s\u001b[0K\u001b[1G96 MiB [] 58% 0.5s\u001b[0K\u001b[1G96 MiB [] 61% 0.4s\u001b[0K\u001b[1G96 MiB [] 63% 0.4s\u001b[0K\u001b[1G96 MiB [] 66% 0.3s\u001b[0K\u001b[1G96 MiB [] 69% 0.3s\u001b[0K\u001b[1G96 MiB [] 71% 0.3s\u001b[0K\u001b[1G96 MiB [] 74% 0.3s\u001b[0K\u001b[1G96 MiB [] 77% 0.2s\u001b[0K\u001b[1G96 MiB [] 79% 0.2s\u001b[0K\u001b[1G96 MiB [] 82% 0.2s\u001b[0K\u001b[1G96 MiB [] 86% 0.1s\u001b[0K\u001b[1G96 MiB [] 87% 0.2s\u001b[0K\u001b[1G96 MiB [] 88% 0.2s\u001b[0K\u001b[1G96 MiB [] 89% 0.2s\u001b[0K\u001b[1G96 MiB [] 89% 0.3s\u001b[0K\u001b[1G96 MiB [] 90% 0.3s\u001b[0K\u001b[1G96 MiB [] 91% 0.3s\u001b[0K\u001b[1G96 MiB [] 95% 0.1s\u001b[0K\u001b[1G96 MiB [] 96% 0.1s\u001b[0K\u001b[1G96 MiB [] 97% 
0.1s\u001b[0K\u001b[1G96 MiB [] 98% 0.0s\u001b[0K\u001b[1G96 MiB [] 99% 0.0s\u001b[0K\u001b[1G96 MiB [] 100% 0.0s\u001b[0K\n",
1673
  "Firefox 141.0 (playwright build v1490) downloaded to /root/.cache/ms-playwright/firefox-1490\n",
1674
  "Downloading Webkit 26.0 (playwright build v2203)\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/webkit/2203/webkit-ubuntu-22.04.zip\u001b[22m\n",
1675
+ "\u001b[1G94.6 MiB [] 0% 0.0s\u001b[0K\u001b[1G94.6 MiB [] 0% 23.5s\u001b[0K\u001b[1G94.6 MiB [] 0% 22.1s\u001b[0K\u001b[1G94.6 MiB [] 0% 13.5s\u001b[0K\u001b[1G94.6 MiB [] 0% 12.0s\u001b[0K\u001b[1G94.6 MiB [] 0% 11.6s\u001b[0K\u001b[1G94.6 MiB [] 1% 10.0s\u001b[0K\u001b[1G94.6 MiB [] 1% 9.0s\u001b[0K\u001b[1G94.6 MiB [] 1% 8.4s\u001b[0K\u001b[1G94.6 MiB [] 2% 7.9s\u001b[0K\u001b[1G94.6 MiB [] 2% 8.0s\u001b[0K\u001b[1G94.6 MiB [] 2% 7.6s\u001b[0K\u001b[1G94.6 MiB [] 2% 7.1s\u001b[0K\u001b[1G94.6 MiB [] 3% 6.9s\u001b[0K\u001b[1G94.6 MiB [] 3% 6.5s\u001b[0K\u001b[1G94.6 MiB [] 3% 6.6s\u001b[0K\u001b[1G94.6 MiB [] 4% 6.7s\u001b[0K\u001b[1G94.6 MiB [] 5% 5.7s\u001b[0K\u001b[1G94.6 MiB [] 5% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 6% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 6% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 7% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 7% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 8% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 8% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 8% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 9% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 9% 5.4s\u001b[0K\u001b[1G94.6 MiB [] 10% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 10% 5.8s\u001b[0K\u001b[1G94.6 MiB [] 10% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 11% 5.9s\u001b[0K\u001b[1G94.6 MiB [] 11% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 12% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 13% 6.0s\u001b[0K\u001b[1G94.6 MiB [] 13% 5.9s\u001b[0K\u001b[1G94.6 MiB [] 14% 5.9s\u001b[0K\u001b[1G94.6 MiB [] 15% 5.8s\u001b[0K\u001b[1G94.6 MiB [] 15% 5.7s\u001b[0K\u001b[1G94.6 MiB [] 16% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 17% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 17% 5.7s\u001b[0K\u001b[1G94.6 MiB [] 17% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 18% 5.6s\u001b[0K\u001b[1G94.6 MiB [] 18% 5.5s\u001b[0K\u001b[1G94.6 MiB [] 19% 5.4s\u001b[0K\u001b[1G94.6 MiB [] 19% 5.3s\u001b[0K\u001b[1G94.6 MiB [] 20% 5.3s\u001b[0K\u001b[1G94.6 MiB [] 20% 5.4s\u001b[0K\u001b[1G94.6 MiB [] 21% 5.2s\u001b[0K\u001b[1G94.6 MiB [] 21% 5.1s\u001b[0K\u001b[1G94.6 MiB [] 22% 5.0s\u001b[0K\u001b[1G94.6 MiB [] 23% 
4.8s\u001b[0K\u001b[1G94.6 MiB [] 23% 4.7s\u001b[0K\u001b[1G94.6 MiB [] 24% 4.6s\u001b[0K\u001b[1G94.6 MiB [] 25% 4.5s\u001b[0K\u001b[1G94.6 MiB [] 26% 4.4s\u001b[0K\u001b[1G94.6 MiB [] 26% 4.3s\u001b[0K\u001b[1G94.6 MiB [] 27% 4.3s\u001b[0K\u001b[1G94.6 MiB [] 28% 4.2s\u001b[0K\u001b[1G94.6 MiB [] 29% 4.1s\u001b[0K\u001b[1G94.6 MiB [] 29% 4.0s\u001b[0K\u001b[1G94.6 MiB [] 30% 4.0s\u001b[0K\u001b[1G94.6 MiB [] 31% 3.9s\u001b[0K\u001b[1G94.6 MiB [] 32% 3.9s\u001b[0K\u001b[1G94.6 MiB [] 33% 3.9s\u001b[0K\u001b[1G94.6 MiB [] 33% 3.8s\u001b[0K\u001b[1G94.6 MiB [] 34% 3.8s\u001b[0K\u001b[1G94.6 MiB [] 35% 3.7s\u001b[0K\u001b[1G94.6 MiB [] 36% 3.6s\u001b[0K\u001b[1G94.6 MiB [] 36% 3.5s\u001b[0K\u001b[1G94.6 MiB [] 37% 3.5s\u001b[0K\u001b[1G94.6 MiB [] 38% 3.4s\u001b[0K\u001b[1G94.6 MiB [] 39% 3.4s\u001b[0K\u001b[1G94.6 MiB [] 39% 3.3s\u001b[0K\u001b[1G94.6 MiB [] 40% 3.3s\u001b[0K\u001b[1G94.6 MiB [] 40% 3.2s\u001b[0K\u001b[1G94.6 MiB [] 41% 3.2s\u001b[0K\u001b[1G94.6 MiB [] 42% 3.2s\u001b[0K\u001b[1G94.6 MiB [] 42% 3.1s\u001b[0K\u001b[1G94.6 MiB [] 43% 3.0s\u001b[0K\u001b[1G94.6 MiB [] 44% 2.9s\u001b[0K\u001b[1G94.6 MiB [] 45% 2.8s\u001b[0K\u001b[1G94.6 MiB [] 46% 2.8s\u001b[0K\u001b[1G94.6 MiB [] 47% 2.7s\u001b[0K\u001b[1G94.6 MiB [] 48% 2.7s\u001b[0K\u001b[1G94.6 MiB [] 48% 2.6s\u001b[0K\u001b[1G94.6 MiB [] 49% 2.6s\u001b[0K\u001b[1G94.6 MiB [] 50% 2.5s\u001b[0K\u001b[1G94.6 MiB [] 51% 2.5s\u001b[0K\u001b[1G94.6 MiB [] 52% 2.4s\u001b[0K\u001b[1G94.6 MiB [] 53% 2.3s\u001b[0K\u001b[1G94.6 MiB [] 54% 2.2s\u001b[0K\u001b[1G94.6 MiB [] 55% 2.2s\u001b[0K\u001b[1G94.6 MiB [] 56% 2.1s\u001b[0K\u001b[1G94.6 MiB [] 57% 2.1s\u001b[0K\u001b[1G94.6 MiB [] 58% 2.0s\u001b[0K\u001b[1G94.6 MiB [] 59% 1.9s\u001b[0K\u001b[1G94.6 MiB [] 60% 1.8s\u001b[0K\u001b[1G94.6 MiB [] 61% 1.8s\u001b[0K\u001b[1G94.6 MiB [] 62% 1.7s\u001b[0K\u001b[1G94.6 MiB [] 63% 1.7s\u001b[0K\u001b[1G94.6 MiB [] 64% 1.6s\u001b[0K\u001b[1G94.6 MiB [] 65% 1.5s\u001b[0K\u001b[1G94.6 MiB [] 67% 
1.5s\u001b[0K\u001b[1G94.6 MiB [] 67% 1.4s\u001b[0K\u001b[1G94.6 MiB [] 68% 1.4s\u001b[0K\u001b[1G94.6 MiB [] 69% 1.4s\u001b[0K\u001b[1G94.6 MiB [] 69% 1.3s\u001b[0K\u001b[1G94.6 MiB [] 70% 1.3s\u001b[0K\u001b[1G94.6 MiB [] 71% 1.2s\u001b[0K\u001b[1G94.6 MiB [] 72% 1.2s\u001b[0K\u001b[1G94.6 MiB [] 73% 1.1s\u001b[0K\u001b[1G94.6 MiB [] 74% 1.1s\u001b[0K\u001b[1G94.6 MiB [] 75% 1.0s\u001b[0K\u001b[1G94.6 MiB [] 76% 1.0s\u001b[0K\u001b[1G94.6 MiB [] 77% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 78% 0.9s\u001b[0K\u001b[1G94.6 MiB [] 79% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 80% 0.8s\u001b[0K\u001b[1G94.6 MiB [] 81% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 82% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 83% 0.7s\u001b[0K\u001b[1G94.6 MiB [] 84% 0.6s\u001b[0K\u001b[1G94.6 MiB [] 86% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 87% 0.5s\u001b[0K\u001b[1G94.6 MiB [] 88% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 89% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 90% 0.4s\u001b[0K\u001b[1G94.6 MiB [] 91% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 92% 0.3s\u001b[0K\u001b[1G94.6 MiB [] 94% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 95% 0.2s\u001b[0K\u001b[1G94.6 MiB [] 97% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 98% 0.1s\u001b[0K\u001b[1G94.6 MiB [] 99% 0.0s\u001b[0K\u001b[1G94.6 MiB [] 100% 0.0s\u001b[0K\n",
1676
  "Webkit 26.0 (playwright build v2203) downloaded to /root/.cache/ms-playwright/webkit-2203\n",
1677
  "Downloading FFMPEG playwright build v1011\u001b[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/ffmpeg/1011/ffmpeg-linux.zip\u001b[22m\n",
1678
+ "\u001b[1G2.3 MiB [] 0% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 11% 0.1s\u001b[0K\u001b[1G2.3 MiB [] 48% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 90% 0.0s\u001b[0K\u001b[1G2.3 MiB [] 100% 0.0s\u001b[0K\n",
1679
  "FFMPEG playwright build v1011 downloaded to /root/.cache/ms-playwright/ffmpeg-1011\n",
1680
  "Playwright Host validation warning: \n",
1681
  "╔══════════════════════════════════════════════════════╗\n",
 
1692
  "║ libmanette-0.2.so.0 ║\n",
1693
  "╚══════════════════════════════════════════════════════╝\n",
1694
  " at validateDependenciesLinux (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:269:9)\n",
 
1695
  " at async Registry._validateHostRequirements (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:934:14)\n",
1696
  " at async Registry._validateHostRequirementsForExecutableIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1056:7)\n",
1697
  " at async Registry.validateHostRequirementsForExecutablesIfNeeded (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/server/registry/index.js:1045:7)\n",
1698
  " at async i.<anonymous> (/usr/local/lib/python3.12/dist-packages/playwright/driver/package/lib/cli/program.js:217:7)\n",
1699
  "Installing dependencies...\n",
1700
  "Hit:1 https://cli.github.com/packages stable InRelease\n",
1701
+ "Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
1702
+ "Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n",
1703
+ "Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
1704
+ "Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n",
1705
+ "Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n",
1706
+ "Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
1707
+ "Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
1708
+ "Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
1709
+ "Get:10 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,287 kB]\n",
1710
+ "Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
1711
+ "Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,812 kB]\n",
1712
+ "Get:13 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n",
1713
+ "Get:14 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,372 kB]\n",
1714
+ "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,778 kB]\n",
1715
+ "Get:16 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [69.2 kB]\n",
1716
+ "Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,594 kB]\n",
1717
+ "Get:18 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [5,988 kB]\n",
1718
+ "Fetched 25.3 MB in 3s (9,111 kB/s)\n",
 
 
1719
  "Reading package lists... Done\n",
1720
  "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
1721
  "Reading package lists... Done\n",
 
1874
  " libwildmidi2 libwoff1 libxtst6 libyuv0 libzbar0 libzxingcore1\n",
1875
  " session-migration timgm6mb-soundfont xfonts-cyrillic xfonts-encodings\n",
1876
  " xfonts-scalable xfonts-utils\n",
1877
+ "0 upgraded, 94 newly installed, 0 to remove and 40 not upgraded.\n",
1878
  "Need to get 48.2 MB of archives.\n",
1879
  "After this operation, 123 MB of additional disk space will be used.\n",
1880
  "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-ipafont-gothic all 00303-21ubuntu1 [3,513 kB]\n",
 
1971
  "Get:92 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-aacenc0 amd64 0.1.3-2 [69.4 kB]\n",
1972
  "Get:93 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libvo-amrwbenc0 amd64 0.1.3-2 [68.2 kB]\n",
1973
  "Get:94 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 gstreamer1.0-plugins-bad amd64 1.20.3-0ubuntu1.1 [2,602 kB]\n",
1974
+ "Fetched 48.2 MB in 5s (10.3 MB/s)\n",
1975
  "Extracting templates from packages: 100%\n",
1976
  "Preconfiguring packages ...\n",
1977
  "Selecting previously unselected package fonts-ipafont-gothic.\n",
 
2351
  "Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...\n",
2352
  "Processing triggers for libglib2.0-0:amd64 (2.72.4-0ubuntu2.6) ...\n",
2353
  "Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
2354
+ "/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link\n",
2355
  "\n",
2356
+ "/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is not a symbolic link\n",
2357
  "\n",
2358
+ "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
2359
  "\n",
2360
+ "/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link\n",
2361
  "\n",
2362
  "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
2363
  "\n",
2364
+ "/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link\n",
2365
  "\n",
2366
  "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
2367
  "\n",
2368
+ "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero_v2.so.0 is not a symbolic link\n",
2369
  "\n",
2370
+ "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link\n",
2371
  "\n",
2372
+ "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link\n",
2373
  "\n",
2374
  "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
2375
  "\n",
2376
+ "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
2377
  "\n",
2378
+ "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
2379
  "\n",
2380
+ "/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link\n",
2381
  "\n",
2382
  "Setting up glib-networking:amd64 (2.72.0-1) ...\n",
2383
  "Setting up libsoup2.4-1:amd64 (2.74.2-3ubuntu0.6) ...\n",
 
2390
  "Setting up gstreamer1.0-plugins-bad:amd64 (1.20.3-0ubuntu1.1) ...\n",
2391
  "Processing triggers for dictionaries-common (1.28.14) ...\n",
2392
  "Processing triggers for libc-bin (2.35-0ubuntu3.8) ...\n",
2393
+ "/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link\n",
2394
  "\n",
2395
+ "/sbin/ldconfig.real: /usr/local/lib/libhwloc.so.15 is not a symbolic link\n",
2396
  "\n",
2397
+ "/sbin/ldconfig.real: /usr/local/lib/libtbbbind.so.3 is not a symbolic link\n",
2398
  "\n",
2399
+ "/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link\n",
2400
  "\n",
2401
  "/sbin/ldconfig.real: /usr/local/lib/libtbb.so.12 is not a symbolic link\n",
2402
  "\n",
2403
+ "/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link\n",
2404
  "\n",
2405
  "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_0.so.3 is not a symbolic link\n",
2406
  "\n",
2407
+ "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero_v2.so.0 is not a symbolic link\n",
2408
  "\n",
2409
+ "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_opencl.so.0 is not a symbolic link\n",
2410
  "\n",
2411
+ "/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link\n",
2412
  "\n",
2413
  "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc.so.2 is not a symbolic link\n",
2414
  "\n",
2415
+ "/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link\n",
2416
+ "\n",
2417
  "/sbin/ldconfig.real: /usr/local/lib/libtbbbind_2_5.so.3 is not a symbolic link\n",
2418
  "\n",
2419
+ "/sbin/ldconfig.real: /usr/local/lib/libur_loader.so.0 is not a symbolic link\n",
2420
  "\n",
2421
+ "Collecting pdfplumber\n",
2422
+ " Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)\n",
2423
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.8/42.8 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2424
+ "\u001b[?25hCollecting pdfminer.six==20250506 (from pdfplumber)\n",
2425
+ " Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)\n",
2426
+ "Requirement already satisfied: Pillow>=9.1 in /usr/local/lib/python3.12/dist-packages (from pdfplumber) (11.3.0)\n",
2427
+ "Collecting pypdfium2>=4.18.0 (from pdfplumber)\n",
2428
+ " Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)\n",
2429
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2430
+ "\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (3.4.4)\n",
2431
+ "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.12/dist-packages (from pdfminer.six==20250506->pdfplumber) (43.0.3)\n",
2432
+ "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.0.0)\n",
2433
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20250506->pdfplumber) (2.23)\n",
2434
+ "Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)\n",
2435
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2436
+ "\u001b[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)\n",
2437
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m88.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2438
+ "\u001b[?25hDownloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
2439
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m120.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2440
+ "\u001b[?25hInstalling collected packages: pypdfium2, pdfminer.six, pdfplumber\n",
2441
+ "Successfully installed pdfminer.six-20250506 pdfplumber-0.11.7 pypdfium2-4.30.0\n",
2442
+ "Collecting trafilatura\n",
2443
+ " Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)\n",
2444
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from trafilatura) (2025.10.5)\n",
2445
+ "Requirement already satisfied: charset_normalizer>=3.4.0 in /usr/local/lib/python3.12/dist-packages (from trafilatura) (3.4.4)\n",
2446
+ "Collecting courlan>=1.3.2 (from trafilatura)\n",
2447
+ " Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)\n",
2448
+ "Collecting htmldate>=1.9.2 (from trafilatura)\n",
2449
+ " Downloading htmldate-1.9.3-py3-none-any.whl.metadata (10 kB)\n",
2450
+ "Collecting justext>=3.0.1 (from trafilatura)\n",
2451
+ " Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)\n",
2452
+ "Requirement already satisfied: lxml>=5.3.0 in /usr/local/lib/python3.12/dist-packages (from trafilatura) (5.4.0)\n",
2453
+ "Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/lib/python3.12/dist-packages (from trafilatura) (2.5.0)\n",
2454
+ "Requirement already satisfied: babel>=2.16.0 in /usr/local/lib/python3.12/dist-packages (from courlan>=1.3.2->trafilatura) (2.17.0)\n",
2455
+ "Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)\n",
2456
+ " Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)\n",
2457
+ "Collecting dateparser>=1.1.2 (from htmldate>=1.9.2->trafilatura)\n",
2458
+ " Downloading dateparser-1.2.2-py3-none-any.whl.metadata (29 kB)\n",
2459
+ "Requirement already satisfied: python-dateutil>=2.9.0.post0 in /usr/local/lib/python3.12/dist-packages (from htmldate>=1.9.2->trafilatura) (2.9.0.post0)\n",
2460
+ "Requirement already satisfied: pytz>=2024.2 in /usr/local/lib/python3.12/dist-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (2025.2)\n",
2461
+ "Requirement already satisfied: regex>=2024.9.11 in /usr/local/lib/python3.12/dist-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (2024.11.6)\n",
2462
+ "Requirement already satisfied: tzlocal>=0.2 in /usr/local/lib/python3.12/dist-packages (from dateparser>=1.1.2->htmldate>=1.9.2->trafilatura) (5.3.1)\n",
2463
+ "Collecting lxml_html_clean (from lxml[html_clean]>=4.4.2->justext>=3.0.1->trafilatura)\n",
2464
+ " Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)\n",
2465
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.9.0.post0->htmldate>=1.9.2->trafilatura) (1.17.0)\n",
2466
+ "Downloading trafilatura-2.0.0-py3-none-any.whl (132 kB)\n",
2467
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.6/132.6 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2468
+ "\u001b[?25hDownloading courlan-1.3.2-py3-none-any.whl (33 kB)\n",
2469
+ "Downloading htmldate-1.9.3-py3-none-any.whl (31 kB)\n",
2470
+ "Downloading justext-3.0.2-py2.py3-none-any.whl (837 kB)\n",
2471
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m837.9/837.9 kB\u001b[0m \u001b[31m32.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2472
+ "\u001b[?25hDownloading dateparser-1.2.2-py3-none-any.whl (315 kB)\n",
2473
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m315.5/315.5 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2474
+ "\u001b[?25hDownloading tld-0.13.1-py2.py3-none-any.whl (274 kB)\n",
2475
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.7/274.7 kB\u001b[0m \u001b[31m23.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
2476
+ "\u001b[?25hDownloading lxml_html_clean-0.4.3-py3-none-any.whl (14 kB)\n",
2477
+ "Installing collected packages: tld, lxml_html_clean, dateparser, courlan, justext, htmldate, trafilatura\n",
2478
+ "Successfully installed courlan-1.3.2 dateparser-1.2.2 htmldate-1.9.3 justext-3.0.2 lxml_html_clean-0.4.3 tld-0.13.1 trafilatura-2.0.0\n"
2479
+ ]
2480
+ }
2481
+ ]
2482
+ },
2483
+ {
2484
+ "cell_type": "code",
2485
+ "execution_count": 3,
2486
+ "id": "wVdx5j24HKcp",
2487
+ "metadata": {
2488
+ "colab": {
2489
+ "base_uri": "https://localhost:8080/"
2490
+ },
2491
+ "id": "wVdx5j24HKcp",
2492
+ "outputId": "01d9c03a-d9d5-43b2-9626-e29ed85360d2"
2493
+ },
2494
+ "outputs": [
2495
+ {
2496
+ "output_type": "stream",
2497
+ "name": "stdout",
2498
+ "text": [
2499
+ "Installing dependencies...\n",
2500
+ "Hit:1 https://cli.github.com/packages stable InRelease\n",
2501
+ "Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease\n",
2502
+ "Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
2503
+ "Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
2504
+ "Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease\n",
2505
+ "Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease\n",
2506
+ "Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease\n",
2507
+ "Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease\n",
2508
+ "Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
2509
+ "Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
2510
+ "Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
2511
+ "Reading package lists... Done\n",
2512
+ "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
2513
+ "Reading package lists... Done\n",
2514
+ "Building dependency tree... Done\n",
2515
+ "Reading state information... Done\n",
2516
+ "fonts-freefont-ttf is already the newest version (20120503-10build1).\n",
2517
+ "fonts-liberation is already the newest version (1:1.07.4-11).\n",
2518
+ "libasound2 is already the newest version (1.2.6.1-1ubuntu1).\n",
2519
+ "libatk-bridge2.0-0 is already the newest version (2.38.0-3).\n",
2520
+ "libatk1.0-0 is already the newest version (2.36.0-3build1).\n",
2521
+ "libatspi2.0-0 is already the newest version (2.44.0-3).\n",
2522
+ "libcairo-gobject2 is already the newest version (1.16.0-5ubuntu2).\n",
2523
+ "libcairo2 is already the newest version (1.16.0-5ubuntu2).\n",
2524
+ "libdbus-glib-1-2 is already the newest version (0.112-2build1).\n",
2525
+ "libegl1 is already the newest version (1.4.0-1).\n",
2526
+ "libenchant-2-2 is already the newest version (2.3.2-1ubuntu2).\n",
2527
+ "libepoxy0 is already the newest version (1.5.10-1).\n",
2528
+ "libevdev2 is already the newest version (1.12.1+dfsg-1).\n",
2529
+ "libevent-2.1-7 is already the newest version (2.1.12-stable-1build3).\n",
2530
+ "libfontconfig1 is already the newest version (2.13.1-4.2ubuntu5).\n",
2531
+ "libgles2 is already the newest version (1.4.0-1).\n",
2532
+ "libglx0 is already the newest version (1.4.0-1).\n",
2533
+ "libgudev-1.0-0 is already the newest version (1:237-2build1).\n",
2534
+ "libhyphen0 is already the newest version (2.8.8-7build2).\n",
2535
+ "libicu70 is already the newest version (70.1-2).\n",
2536
+ "libjpeg-turbo8 is already the newest version (2.1.2-0ubuntu1).\n",
2537
+ "liblcms2-2 is already the newest version (2.12~rc1-2build2).\n",
2538
+ "libmanette-0.2-0 is already the newest version (0.2.6-3build1).\n",
2539
+ "libopengl0 is already the newest version (1.4.0-1).\n",
2540
+ "libopus0 is already the newest version (1.3.1-0.1build2).\n",
2541
+ "libpng16-16 is already the newest version (1.6.37-3build5).\n",
2542
+ "libproxy1v5 is already the newest version (0.4.17-2).\n",
2543
+ "libsecret-1-0 is already the newest version (0.20.5-2).\n",
2544
+ "libwoff1 is already the newest version (1.0.2-1build4).\n",
2545
+ "libxcb-shm0 is already the newest version (1.14-3ubuntu3).\n",
2546
+ "libxcb1 is already the newest version (1.14-3ubuntu3).\n",
2547
+ "libxcomposite1 is already the newest version (1:0.4.5-1build2).\n",
2548
+ "libxcursor1 is already the newest version (1:1.2.0-2build4).\n",
2549
+ "libxdamage1 is already the newest version (1:1.1.5-2build2).\n",
2550
+ "libxext6 is already the newest version (2:1.3.4-1build1).\n",
2551
+ "libxfixes3 is already the newest version (1:6.0.0-1).\n",
2552
+ "libxi6 is already the newest version (2:1.8-1build1).\n",
2553
+ "libxkbcommon0 is already the newest version (1.4.0-1).\n",
2554
+ "libxrandr2 is already the newest version (2:1.5.2-1build1).\n",
2555
+ "libxrender1 is already the newest version (1:0.9.10-1build4).\n",
2556
+ "libxtst6 is already the newest version (2:1.2.3-1build4).\n",
2557
+ "xfonts-scalable is already the newest version (1:1.0.3-1.2ubuntu1).\n",
2558
+ "fonts-ipafont-gothic is already the newest version (00303-21ubuntu1).\n",
2559
+ "fonts-tlwg-loma-otf is already the newest version (1:0.7.3-1).\n",
2560
+ "fonts-unifont is already the newest version (1:14.0.01-1).\n",
2561
+ "fonts-wqy-zenhei is already the newest version (0.9.45-8).\n",
2562
+ "libavif13 is already the newest version (0.9.3-3).\n",
2563
+ "libffi7 is already the newest version (3.3-5ubuntu1).\n",
2564
+ "libx264-163 is already the newest version (2:0.163.3060+git5db6aa6-2build1).\n",
2565
+ "xfonts-cyrillic is already the newest version (1:1.0.5).\n",
2566
+ "fonts-noto-color-emoji is already the newest version (2.047-0ubuntu0.22.04.1).\n",
2567
+ "gstreamer1.0-plugins-base is already the newest version (1.20.1-1ubuntu0.5).\n",
2568
+ "gstreamer1.0-plugins-good is already the newest version (1.20.3-0ubuntu1.4).\n",
2569
+ "libatomic1 is already the newest version (12.3.0-1ubuntu1~22.04.2).\n",
2570
+ "libcups2 is already the newest version (2.4.1op1-1ubuntu4.12).\n",
2571
+ "libdbus-1-3 is already the newest version (1.12.20-2ubuntu4.1).\n",
2572
+ "libdrm2 is already the newest version (2.4.113-2~ubuntu0.22.04.1).\n",
2573
+ "libfreetype6 is already the newest version (2.11.1+dfsg-1ubuntu0.3).\n",
2574
+ "libgbm1 is already the newest version (23.2.1-1ubuntu3.1~22.04.3).\n",
2575
+ "libgdk-pixbuf-2.0-0 is already the newest version (2.42.8+dfsg-1ubuntu0.4).\n",
2576
+ "libglib2.0-0 is already the newest version (2.72.4-0ubuntu2.6).\n",
2577
+ "libgstreamer-gl1.0-0 is already the newest version (1.20.1-1ubuntu0.5).\n",
2578
+ "libgstreamer-plugins-base1.0-0 is already the newest version (1.20.1-1ubuntu0.5).\n",
2579
+ "libgstreamer1.0-0 is already the newest version (1.20.3-0ubuntu1.1).\n",
2580
+ "libgtk-3-0 is already the newest version (3.24.33-1ubuntu2.2).\n",
2581
+ "libgtk-4-1 is already the newest version (4.6.9+ds-0ubuntu0.22.04.2).\n",
2582
+ "libharfbuzz-icu0 is already the newest version (2.7.4-1ubuntu3.2).\n",
2583
+ "libharfbuzz0b is already the newest version (2.7.4-1ubuntu3.2).\n",
2584
+ "libnotify4 is already the newest version (0.7.9-3ubuntu5.22.04.1).\n",
2585
+ "libnspr4 is already the newest version (2:4.35-0ubuntu0.22.04.1).\n",
2586
+ "libnss3 is already the newest version (2:3.98-0ubuntu0.22.04.2).\n",
2587
+ "libopenjp2-7 is already the newest version (2.4.0-6ubuntu0.4).\n",
2588
+ "libpango-1.0-0 is already the newest version (1.50.6+ds-2ubuntu1).\n",
2589
+ "libpangocairo-1.0-0 is already the newest version (1.50.6+ds-2ubuntu1).\n",
2590
+ "libwayland-client0 is already the newest version (1.20.0-1ubuntu0.1).\n",
2591
+ "libwayland-egl1 is already the newest version (1.20.0-1ubuntu0.1).\n",
2592
+ "libwayland-server0 is already the newest version (1.20.0-1ubuntu0.1).\n",
2593
+ "libwebpdemux2 is already the newest version (1.2.2-2ubuntu0.22.04.2).\n",
2594
+ "libx11-6 is already the newest version (2:1.7.5-1ubuntu0.3).\n",
2595
+ "libx11-xcb1 is already the newest version (2:1.7.5-1ubuntu0.3).\n",
2596
+ "libxml2 is already the newest version (2.9.13+dfsg-1ubuntu0.9).\n",
2597
+ "libxslt1.1 is already the newest version (1.1.34-4ubuntu0.22.04.4).\n",
2598
+ "ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).\n",
2599
+ "gstreamer1.0-libav is already the newest version (1.20.3-0ubuntu1).\n",
2600
+ "gstreamer1.0-plugins-bad is already the newest version (1.20.3-0ubuntu1.1).\n",
2601
+ "libsoup-3.0-0 is already the newest version (3.0.7-0ubuntu1).\n",
2602
+ "xvfb is already the newest version (2:21.1.4-2ubuntu1.7~22.04.15).\n",
2603
+ "0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.\n"
2604
  ]
2605
  }
2606
  ],
 
2611
  },
2612
  {
2613
  "cell_type": "code",
2614
+ "execution_count": 6,
2615
  "id": "uNgeNVFoMErV",
2616
  "metadata": {
2617
  "id": "uNgeNVFoMErV"
 
2619
  "outputs": [],
2620
  "source": [
2621
  "import requests\n",
2622
+ "from pdfminer.high_level import extract_text\n",
2623
+ "import asyncio\n",
2624
+ "import aiohttp\n",
2625
+ "import json"
2626
  ]
2627
  },
2628
  {
2629
  "cell_type": "code",
2630
+ "execution_count": 7,
2631
  "id": "tvAnpg8zMA08",
2632
  "metadata": {
2633
  "id": "tvAnpg8zMA08"
2634
  },
2635
  "outputs": [],
2636
  "source": [
2637
async def extract_text_from_pdf(url: str, session: aiohttp.ClientSession) -> str | None:
    """
    Download a PDF from ``url`` and extract its plain text with pdfminer.six.

    Args:
        url: Address of the PDF document.
        session: Shared aiohttp session used to perform the download.

    Returns:
        The extracted text with leading/trailing whitespace stripped, or
        ``None`` when the server does not answer with HTTP 200 or when the
        download or the extraction raises.
    """
    # Imported locally so the function does not rely on `io` having been
    # imported by some other cell of the notebook.
    import io

    print(f" -> Detect PDF link. Handle by pdfminer.six: {url}")
    try:
        # aiohttp expects a ClientTimeout object; a bare number is a deprecated form.
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
            if response.status != 200:
                print(f"❌ Failed to download PDF ({response.status})")
                return None

            data = await response.read()

        # pdfminer's extract_text is synchronous; running it in a worker thread
        # keeps the event loop free for the other downloads that are gathered
        # concurrently with this one.
        text = await asyncio.to_thread(extract_text, io.BytesIO(data))
        print("✅ SUCCESS! Extracted text using pdfminer.six ---")
        return text.strip()

    except Exception as e:
        # Best-effort crawler: report the problem and let the caller fall back.
        print(f"Error when open file PDF {url}: {e}")
        return None
2657
  ]
2658
  },
2659
  {
2660
  "cell_type": "code",
2661
+ "execution_count": 16,
2662
  "id": "xSKWwAbIBwTu",
2663
  "metadata": {
2664
  "id": "xSKWwAbIBwTu"
 
2666
  "outputs": [],
2667
  "source": [
2668
  "from playwright.async_api import async_playwright\n",
2669
+ "from playwright_stealth import Stealth\n",
2670
  "import trafilatura"
2671
  ]
2672
  },
2673
  {
2674
  "cell_type": "code",
2675
+ "execution_count": 17,
2676
  "id": "6QF-79pKSBw1",
2677
  "metadata": {
2678
  "id": "6QF-79pKSBw1"
2679
  },
2680
  "outputs": [],
2681
  "source": [
2682
async def _render_html(playwright_cm, url: str, **goto_kwargs) -> str:
    """Open ``url`` in headless Chromium via ``playwright_cm`` and return the page HTML."""
    async with playwright_cm as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, **goto_kwargs)
            return await page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()


async def extract_text_from_web(url: str, session: aiohttp.ClientSession) -> str | None:
    """
    Extract the main text of a web page.

    The page is first fetched through the shared aiohttp ``session`` and parsed
    with trafilatura. If that fails for any reason (non-200 status, network
    error, nothing extracted), the page is rendered with Playwright instead;
    when the rendered HTML contains the marker "Cloudflare Ray ID", it is
    rendered once more through playwright-stealth.

    Args:
        url: Address of the page.
        session: Shared aiohttp session used for the plain HTTP fetch.

    Returns:
        The extracted text, or ``None`` when no text could be obtained.
    """
    print(f" -> Detect web link. Handle by Trafilatura: {url}")

    try:
        # Fetch the HTML through the shared session instead of trafilatura.fetch_url.
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
            if resp.status != 200:
                raise ValueError(f"HTTP {resp.status}")
            html_content = await resp.text()

        text = trafilatura.extract(html_content)
        if text:
            print("✅ SUCCESS! Extracted text using Trafilatura ---")
            return text

        raise ValueError("Trafilatura extraction return None")

    except Exception as e:
        print(f"⚠️ Error using Trafilatura for URL {url}: {e}")
        print("Falling back to Playwright extraction ...")

    try:
        html_content = await _render_html(
            async_playwright(), url, timeout=15000, wait_until="domcontentloaded"
        )

        if not html_content:
            return None

        # A Cloudflare block page was served: retry with the stealth wrapper.
        if "Cloudflare Ray ID" in html_content:
            print("⚠️ Detected Cloudflare! Retrying with Stealth...")
            html_content = await _render_html(
                Stealth().use_async(async_playwright()), url, timeout=20000
            )

        main_text = trafilatura.extract(html_content, include_comments=False)
        if not main_text:
            # Do not report success when nothing was extracted from the rendered page.
            print(f"❌ Playwright rendered {url} but no text could be extracted")
            return None

        print("✅ SUCCESS! Extracted text using Playwright ---")
        return main_text

    except Exception as e1:
        print(f"❌ Error using Playwright extraction for URL {url}: {e1}")
        return None
2736
  ]
2737
  },
2738
  {
2739
  "cell_type": "code",
2740
+ "execution_count": 10,
2741
  "id": "rL1vDTvHMwAj",
2742
  "metadata": {
2743
  "id": "rL1vDTvHMwAj"
2744
  },
2745
  "outputs": [],
2746
  "source": [
2747
async def fetch_content_from_url(url: str, session: aiohttp.ClientSession) -> str | None:
    """
    Dispatcher: inspect the URL and call the matching extraction routine.

    A URL whose path ends with ".pdf" (case-insensitive) is handled by
    ``extract_text_from_pdf``; everything else by ``extract_text_from_web``.

    Args:
        url: Address of the document to fetch.
        session: Shared aiohttp session passed through to the extractor.

    Returns:
        Whatever the selected extractor returns (text or ``None``).
    """
    from urllib.parse import urlparse

    # Look only at the path so PDF links that carry a query string or a
    # fragment (e.g. ".../report.pdf?download=1") are still detected.
    try:
        path = urlparse(url).path
    except ValueError:
        # Malformed URL: fall back to the raw string, as the plain suffix check did.
        path = url

    if path.lower().endswith('.pdf'):
        return await extract_text_from_pdf(url, session)
    return await extract_text_from_web(url, session)
 
2755
  ]
2756
  },
2757
  {
2758
  "cell_type": "code",
2759
+ "execution_count": 11,
2760
  "id": "w3y0tq_pLIXu",
2761
  "metadata": {
2762
  "id": "w3y0tq_pLIXu"
 
2793
  },
2794
  {
2795
  "cell_type": "code",
2796
+ "execution_count": 12,
2797
  "id": "0eJs0RfoBz5o",
2798
  "metadata": {
2799
  "id": "0eJs0RfoBz5o"
2800
  },
2801
  "outputs": [],
2802
  "source": [
2803
+ "import json\n",
2804
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
2805
+ "\n",
2806
  "def chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50) -> list[str]:\n",
2807
  " \"\"\"Hàm tiện ích để chia văn bản dài thành các chunk nhỏ hơn.\"\"\"\n",
2808
  " text_splitter = RecursiveCharacterTextSplitter(\n",
 
2815
  },
2816
  {
2817
  "cell_type": "code",
2818
+ "execution_count": 13,
2819
+ "id": "BsLJSWmbHrPf",
2820
  "metadata": {
2821
+ "id": "BsLJSWmbHrPf"
2822
  },
2823
  "outputs": [],
2824
  "source": [
2825
async def process_claims_parallel(retrieved_data):
    """
    Crawl every document of every claim and split the text into chunks.

    Claims are handled one after another; the links belonging to one claim are
    fetched concurrently through a single shared aiohttp session. When a page
    yields more than 100 characters, its cleaned text is used; otherwise the
    document's cleaned ``snippet`` is used instead. In both cases the document
    title is prepended before chunking.

    Args:
        retrieved_data: Mapping of claim -> list of document dicts. Each
            document must have a ``link`` key; ``title`` and ``snippet`` are
            optional and default to an empty string.

    Returns:
        Mapping of claim -> list of ``{"text": chunk, "link": url}`` dicts.
    """
    results = {}

    async with aiohttp.ClientSession() as session:
        for claim, documents in retrieved_data.items():
            print(f"\n{'='*50}\nHandle claim: '{claim}'")
            claim_chunks = []

            # Collect every link that has to be crawled for this claim.
            urls = [doc['link'] for doc in documents]

            # Fetch all of them concurrently; results keep the order of `urls`.
            print(f" -> Crawling {len(urls)} links in parallel...")
            fetched = await asyncio.gather(
                *(fetch_content_from_url(link, session) for link in urls)
            )

            # Pair each document with the content fetched for it.
            for doc, page_text in zip(documents, fetched):
                if page_text and len(page_text) > 100:
                    print(f"SUCCESS!! {doc['link']}")
                    body = clean_text(page_text)
                else:
                    # Too little (or no) content: fall back to the search snippet.
                    print(f"FAIL!! Using snippet for {doc['link']}")
                    body = clean_text(doc.get('snippet', ''))

                combined = f"{doc.get('title', '')}. {body}"

                # Split into chunks and remember which link each one came from.
                claim_chunks.extend(
                    {"text": piece, "link": doc['link']}
                    for piece in chunk_text(combined)
                )

            results[claim] = claim_chunks
            print(f"==> Finish for claim '{claim}'. Total: {len(claim_chunks)} chunks.")

    return results
2870
  ]
2871
  },
2872
  {
2873
  "cell_type": "code",
2874
+ "source": [
2875
+ "# Tải dữ liệu\n",
2876
+ "with open('document_retrieval_results.json', 'r', encoding='utf-8') as f:\n",
2877
+ " retrieved_data = json.load(f)\n",
2878
+ "\n",
2879
+ "evidence_by_claim = await process_claims_parallel(retrieved_data)\n",
2880
+ "\n",
2881
+ "# Lưu lại nếu cần\n",
2882
+ "# with open('evidence_chunks.json', 'w', encoding='utf-8') as f:\n",
2883
+ "# json.dump(evidence_by_claim, f, ensure_ascii=False, indent=2)"
2884
+ ],
2885
  "metadata": {
2886
  "colab": {
2887
  "base_uri": "https://localhost:8080/"
2888
  },
2889
+ "id": "hFOOzyCCTWEq",
2890
+ "outputId": "16b9e384-ac73-45c4-a64e-cf4e0c3a9059"
2891
  },
2892
+ "id": "hFOOzyCCTWEq",
2893
+ "execution_count": 18,
2894
  "outputs": [
2895
  {
 
2896
  "output_type": "stream",
2897
+ "name": "stdout",
2898
  "text": [
2899
  "\n",
2900
  "==================================================\n",
2901
  "Handle claim: 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '\n",
2902
+ " -> Crawling 12 links in parallel...\n",
2903
+ " -> Detect web link. Handle by Trafilatura: https://vnexpress.net/bien-doi-khi-hau-cham-ngoi-cho-thoi-tiet-cuc-doan-the-nao-4739038.html\n",
2904
+ " -> Detect web link. Handle by Trafilatura: https://special.nhandan.vn/biendoikhihauvahanhdongcuavietnam/index.html\n",
2905
+ " -> Detect web link. Handle by Trafilatura: https://nhandan.vn/thich-ung-bien-doi-khi-hau-thuan-thien-ben-vung-post909799.html\n",
2906
+ " -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false\n",
2907
+ " -> Detect web link. Handle by Trafilatura: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2908
+ " -> Detect web link. Handle by Trafilatura: https://vnexpress.net/bien-doi-khi-hau-4796505.html\n",
2909
+ " -> Detect web link. Handle by Trafilatura: https://vnexpress.net/bien-doi-khi-hau-khien-la-nina-co-yeu-to-di-thuong-4791345.html\n",
2910
+ " -> Detect web link. Handle by Trafilatura: https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe\n",
2911
+ " -> Detect web link. Handle by Trafilatura: https://nhandan.vn/hoi-chuong-bao-dong-ve-muc-do-nghiem-trong-cua-cuoc-khung-hoang-khi-hau-post893263.html\n",
2912
+ " -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau\n",
2913
+ " -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem\n",
2914
+ " -> Detect web link. Handle by Trafilatura: https://nhandan.vn/thoi-tiet-cuc-doan-anh-huong-nghiem-trong-den-nen-kinh-te-anh-post712261.html\n",
2915
+ "⚠️ Error using Trafilatura for URL https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe: HTTP 403\n",
2916
+ "Falling back to Playwright extraction ...\n",
2917
+ "✅ SUCCESS! Extracted text using Trafilatura ---\n",
2918
+ "⚠️ Error using Trafilatura for URL https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem: Cannot connect to host moh.gov.vn:443 ssl:default [[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1010)]\n",
2919
+ "Falling back to Playwright extraction ...\n",
2920
+ "⚠️ Error using Trafilatura for URL https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau: Cannot connect to host moh.gov.vn:443 ssl:default [[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1010)]\n",
2921
+ "Falling back to Playwright extraction ...\n",
2922
+ "⚠️ Error using Trafilatura for URL https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false: Cannot connect to host moh.gov.vn:443 ssl:default [[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1010)]\n",
2923
+ "Falling back to Playwright extraction ...\n",
2924
+ " SUCCESS! Extracted text using Trafilatura ---\n",
2925
+ "SUCCESS! Extracted text using Trafilatura ---\n",
2926
+ "SUCCESS! Extracted text using Trafilatura ---\n",
2927
+ "✅ SUCCESS! Extracted text using Trafilatura ---\n",
2928
+ " SUCCESS! Extracted text using Trafilatura ---\n",
2929
+ " SUCCESS! Extracted text using Trafilatura ---\n",
2930
+ "SUCCESS! Extracted text using Trafilatura ---\n",
2931
+ "⚠️ Detected Cloudflare! Retrying with Stealth...\n",
2932
+ "✅ SUCCESS! Extracted text using Playwright ---\n",
2933
+ " SUCCESS! Extracted text using Playwright ---\n",
2934
+ " SUCCESS! Extracted text using Playwright ---\n",
2935
+ "SUCCESS! Extracted text using Playwright ---\n",
2936
+ "SUCCESS!! https://vnexpress.net/bien-doi-khi-hau-cham-ngoi-cho-thoi-tiet-cuc-doan-the-nao-4739038.html\n",
2937
+ "SUCCESS!! https://special.nhandan.vn/biendoikhihauvahanhdongcuavietnam/index.html\n",
2938
+ "SUCCESS!! https://nhandan.vn/thich-ung-bien-doi-khi-hau-thuan-thien-ben-vung-post909799.html\n",
2939
+ "SUCCESS!! https://moh.gov.vn/tin-lien-quan/-/asset_publisher/vjYyM7O9aWnX/content/-anh-gia-thuc-trang-tac-ong-cua-bien-oi-khi-hau-voi-suc-khoe-tai-viet-nam?inheritRedirect=false\n",
2940
+ "SUCCESS!! https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2941
+ "SUCCESS!! https://vnexpress.net/bien-doi-khi-hau-4796505.html\n",
2942
+ "SUCCESS!! https://vnexpress.net/bien-doi-khi-hau-khien-la-nina-co-yeu-to-di-thuong-4791345.html\n",
2943
+ "SUCCESS!! https://publichealth.santaclaracounty.gov/health-information/climate-and-health/khi-hau-va-suc-khoe\n",
2944
+ "SUCCESS!! https://nhandan.vn/hoi-chuong-bao-dong-ve-muc-do-nghiem-trong-cua-cuoc-khung-hoang-khi-hau-post893263.html\n"
2945
+ ]
2946
+ },
2947
+ {
2948
+ "output_type": "stream",
2949
+ "name": "stderr",
2950
+ "text": [
2951
+ "WARNING:pdfminer.pdfpage:The PDF <_io.BytesIO object at 0x780575b075b0> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case\n"
2952
+ ]
2953
+ },
2954
+ {
2955
+ "output_type": "stream",
2956
+ "name": "stdout",
2957
+ "text": [
2958
+ "SUCCESS!! https://moh.gov.vn/tin-tong-hop/-/asset_publisher/k206Q9qkZOqn/content/sot-xuat-huyet-tang-hon-15-who-canh-bao-benh-ngay-cang-kho-luong-do-bien-oi-khi-hau\n",
2959
+ "SUCCESS!! https://moh.gov.vn/tin-noi-bat/-/asset_publisher/3Yst7YhbkA5j/content/thu-truong-bo-y-te-bien-oi-khi-hau-lam-thay-oi-mo-hinh-lay-truyen-muc-o-cac-benh-truyen-nhiem\n",
2960
+ "SUCCESS!! https://nhandan.vn/thoi-tiet-cuc-doan-anh-huong-nghiem-trong-den-nen-kinh-te-anh-post712261.html\n",
2961
+ "==> Finish for claim 'Biến đổi khí hậu đang làm thời tiết cực đoan hơn. '. Total: 178 chunks.\n",
 
2962
  "\n",
2963
  "==================================================\n",
2964
  "Handle claim: 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'\n",
2965
+ " -> Crawling 10 links in parallel...\n",
2966
+ " -> Detect web link. Handle by Trafilatura: https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
2967
+ " -> Detect web link. Handle by Trafilatura: https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/\n",
2968
+ " -> Detect PDF link. Handle by pdfminer.six: https://documents1.worldbank.org/curated/en/099051625143037334/pdf/P176996-1f81a83a-aa8f-49d2-84ea-ec7d286593c5.pdf\n",
2969
+ " -> Detect PDF link. Handle by pdfminer.six: https://documents1.worldbank.org/curated/en/099152108232435513/pdf/IDU-00472f84-1adf-466c-9688-9d150a0879da.pdf\n",
2970
+ " -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
2971
+ " -> Detect PDF link. Handle by pdfminer.six: https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
2972
+ " -> Detect PDF link. Handle by pdfminer.six: https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
2973
+ " -> Detect web link. Handle by Trafilatura: http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
2974
+ " -> Detect web link. Handle by Trafilatura: https://loigiaihay.com/bai-tap-245461.html\n",
2975
+ " -> Detect web link. Handle by Trafilatura: https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\n",
2976
+ " SUCCESS! Extracted text using pdfminer.six ---\n",
2977
+ " SUCCESS! Extracted text using pdfminer.six ---\n",
2978
+ "SUCCESS! Extracted text using pdfminer.six ---\n",
2979
+ "⚠️ Error using Trafilatura for URL https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75: Cannot connect to host moh.gov.vn:443 ssl:default [Connect call failed ('103.124.60.20', 443)]\n",
2980
+ "Falling back to Playwright extraction ...\n",
2981
+ "⚠️ Error using Trafilatura for URL https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc: Cannot connect to host moh.gov.vn:443 ssl:default [Connect call failed ('103.124.60.20', 443)]\n",
2982
+ "Falling back to Playwright extraction ...\n",
2983
+ "SUCCESS! Extracted text using pdfminer.six ---\n",
2984
+ "SUCCESS! Extracted text using Trafilatura ---\n",
2985
+ "✅ SUCCESS! Extracted text using Trafilatura ---\n",
2986
+ " Error using Playwright extraction for URL https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75: Page.goto: Download is starting\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2987
  "Call log:\n",
2988
+ " - navigating to \"https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\", waiting until \"domcontentloaded\"\n",
 
 
 
 
 
 
 
2989
  "\n",
2990
+ "⚠️ Error using Trafilatura for URL https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/: \n",
2991
+ "Falling back to Playwright extraction ...\n",
2992
+ "⚠️ Error using Trafilatura for URL http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html: \n",
2993
+ "Falling back to Playwright extraction ...\n",
2994
+ "✅ SUCCESS! Extracted text using Playwright ---\n",
2995
+ "✅ SUCCESS! Extracted text using Playwright ---\n",
2996
+ "❌ Error using Playwright extraction for URL http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html: Page.goto: Timeout 15000ms exceeded.\n",
2997
  "Call log:\n",
2998
+ " - navigating to \"http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\", waiting until \"domcontentloaded\"\n",
2999
  "\n",
3000
+ "SUCCESS!! https://nhandan.vn/special/biendoi_khihau_dedoa_toancau/index.html\n",
3001
+ "SUCCESS!! https://tiasang.com.vn/khoa-hoc-cong-nghe/bien-doi-khi-hau-nhanh-rong-manh-va-kho-luong-28426/\n",
3002
+ "SUCCESS!! https://documents1.worldbank.org/curated/en/099051625143037334/pdf/P176996-1f81a83a-aa8f-49d2-84ea-ec7d286593c5.pdf\n",
3003
+ "SUCCESS!! https://documents1.worldbank.org/curated/en/099152108232435513/pdf/IDU-00472f84-1adf-466c-9688-9d150a0879da.pdf\n",
3004
+ "SUCCESS!! https://moh.gov.vn/chuong-trinh-muc-tieu-quoc-gia/-/asset_publisher/7ng11fEWgASC/content/mot-so-khai-niem-ve-dinh-duong-thuc-pham-va-hoat-ong-the-luc\n",
3005
+ "SUCCESS!! https://www.bridgestone.com/responsibilities/social/procurement/pdf/Policy_Vietnamese.pdf\n",
3006
+ "SUCCESS!! https://documents1.worldbank.org/curated/en/099448304222426855/pdf/IDU15033e0a81a75f143501911d1dcc883a36364.pdf\n",
3007
+ "FAIL!! Using snippet for http://vnmha.gov.vn/tin-tuc-khcn-120/bien-doi-khi-hau--nhanh-rong-manh-va-kho-luong-10265.html\n",
3008
+ "SUCCESS!! https://loigiaihay.com/bai-tap-245461.html\n",
3009
+ "FAIL!! Using snippet for https://moh.gov.vn/documents/20182/212437/6512.%20Bao%20cao%20danh%20gia%20tac%20dong%20Luat%20ATTP.docx/2fb711aa-0f09-43a2-83af-523e512d8d75\n",
3010
+ "==> Finish for claim 'Nhiệt độ toàn cầu đã tăng 1.1 độ C trong 100 năm qua.'. Total: 1853 chunks.\n"
3011
  ]
3012
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3013
  ]
3014
  },
3015
  {
3016
  "cell_type": "code",
3017
+ "execution_count": null,
3018
  "id": "-CcEWKwlpLII",
3019
  "metadata": {
3020
  "id": "-CcEWKwlpLII"
 
3026
  },
3027
  {
3028
  "cell_type": "code",
3029
+ "execution_count": null,
3030
  "id": "F2Wl6CytHxXu",
3031
  "metadata": {
3032
  "colab": {
 
3442
  },
3443
  {
3444
  "cell_type": "code",
3445
+ "execution_count": null,
3446
  "id": "9J1Z1TzdOBfX",
3447
  "metadata": {
3448
  "id": "9J1Z1TzdOBfX"
 
3454
  },
3455
  {
3456
  "cell_type": "code",
3457
+ "execution_count": null,
3458
  "id": "qYo7yMI9H1Uc",
3459
  "metadata": {
3460
  "colab": {
 
3729
  },
3730
  {
3731
  "cell_type": "code",
3732
+ "execution_count": null,
3733
  "id": "AHMdGO0JOECE",
3734
  "metadata": {
3735
  "id": "AHMdGO0JOECE"
 
3742
  },
3743
  {
3744
  "cell_type": "code",
3745
+ "execution_count": null,
3746
  "id": "B3CSkIO6FqEz",
3747
  "metadata": {
3748
  "colab": {
 
3940
  },
3941
  {
3942
  "cell_type": "code",
3943
+ "execution_count": null,
3944
  "id": "kBYXeNpdIRdt",
3945
  "metadata": {
3946
  "colab": {
 
12253
  },
12254
  "nbformat": 4,
12255
  "nbformat_minor": 5
12256
+ }