amosnbn committed on
Commit
9740d1e
·
1 Parent(s): 7a604fd
Files changed (1) hide show
  1. app.py +61 -36
app.py CHANGED
@@ -1,8 +1,9 @@
1
  # app.py
2
  # PapuaTranslate — Flask + SQLAlchemy (Supabase/SQLite) + mT5/LoRA (lazy)
3
- # Fitur: ProxyFix, cookie Spaces, session permanent, /diag, preload, fallback, strip BOM JSON.
 
4
 
5
- import os, re, json, codecs, pathlib, logging, threading, traceback
6
  from datetime import datetime, timezone, timedelta
7
  from functools import wraps
8
  from flask import Flask, render_template, request, redirect, url_for, session, jsonify, flash
@@ -15,16 +16,12 @@ log = logging.getLogger("papua-app")
15
 
16
  # ========== Flask ==========
17
  app = Flask(__name__, template_folder="frontend", static_folder="static")
18
-
19
- # Trust reverse proxy di HF supaya proto/host benar (perlu untuk cookie Secure)
20
  app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1)
21
-
22
- # Cookie/session aman untuk iframe (HF Spaces) dan tab langsung hf.space
23
  app.config.update(
24
  SECRET_KEY=os.getenv("SECRET_KEY", "dev-secret-change-me"),
25
  SESSION_COOKIE_NAME="hfspace_session",
26
- SESSION_COOKIE_SAMESITE="None", # penting: iframe = third-party context
27
- SESSION_COOKIE_SECURE=True, # wajib True jika SAMESITE=None
28
  SESSION_COOKIE_HTTPONLY=True,
29
  SESSION_COOKIE_PATH="/",
30
  PREFERRED_URL_SCHEME="https",
@@ -44,12 +41,10 @@ if not DATABASE_URL:
44
  DATABASE_URL = "sqlite:////tmp/app.db"
45
  log.warning("[DB] DATABASE_URL tidak diset; pakai SQLite /tmp/app.db")
46
  else:
47
- # normalisasi skema ke psycopg2
48
  if DATABASE_URL.startswith("postgres://"):
49
  DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
50
  elif DATABASE_URL.startswith("postgresql://"):
51
  DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
52
- # tambahkan sslmode kalau belum ada
53
  if DATABASE_URL.startswith("postgresql+psycopg2") and "sslmode=" not in DATABASE_URL:
54
  sep = "&" if "?" in DATABASE_URL else "?"
55
  DATABASE_URL = f"{DATABASE_URL}{sep}sslmode=require"
@@ -95,7 +90,7 @@ def login_required(fn):
95
  return fn(*args, **kwargs)
96
  return _wrap
97
 
98
- # ========== Prenorm (heuristik ringan) ==========
99
  PAPUA_MAP = {
100
  r"\bsa\b": "saya", r"\bko\b": "kamu", r"\btra\b": "tidak", r"\bndak\b": "tidak",
101
  r"\bmo\b": "mau", r"\bpu\b": "punya", r"\bsu\b": "sudah", r"\bkong\b": "kemudian",
@@ -106,9 +101,9 @@ def prenorm(text: str) -> str:
106
  for pat, repl in PAPUA_MAP.items(): t = re.sub(pat, repl, t, flags=re.IGNORECASE)
107
  return t
108
 
109
- # ========== Model (lazy) + Strip BOM ==========
110
- BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "google/mt5-small") # mulai kecil dulu untuk uji cepat
111
- ADAPTER_ID = os.getenv("ADAPTER_ID", "") # kosongkan dulu; isi nanti
112
  DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
113
 
114
  TOK = None
@@ -118,7 +113,6 @@ _MODEL_READY = False
118
  _MODEL_ERROR = None
119
 
120
  def _strip_bom_in_dir(root_dir: str):
121
- """Hapus BOM dari semua *.json (UTF-8-sig) agar tidak memicu JSONDecodeError."""
122
  root = pathlib.Path(root_dir)
123
  for p in root.rglob("*.json"):
124
  try:
@@ -128,11 +122,56 @@ def _strip_bom_in_dir(root_dir: str):
128
  json.dump(data, f, ensure_ascii=False, indent=2)
129
  log.info(f"[BOM] stripped: {p}")
130
  except Exception as e:
131
- # Jangan gagal hanya karena satu file
132
  log.warning(f"[BOM] skip {p}: {e}")
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def _load_model():
135
- """Download ke /tmp, strip BOM, lalu load dari path lokal."""
136
  global TOK, MODEL, _MODEL_READY, _MODEL_ERROR
137
  try:
138
  log.info("[MODEL] downloading base=%s adapter=%s", BASE_MODEL_ID, ADAPTER_ID or "-")
@@ -154,8 +193,8 @@ def _load_model():
154
  allow_patterns=None,
155
  )
156
  _strip_bom_in_dir(adapter_dir)
 
157
 
158
- # Import di sini agar error versi lib ketangkep di /diag
159
  import torch
160
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
161
  from peft import PeftModel
@@ -194,12 +233,8 @@ def translate_with_model(text: str, max_new_tokens: int = 48) -> str:
194
  tok, m = get_model()
195
  if not _MODEL_READY or m is None:
196
  raise RuntimeError(f"Model not ready: {_MODEL_ERROR or 'unknown error'}")
197
-
198
- # Guard panjang input supaya aman
199
  enc = tok([text], return_tensors="pt", truncation=True, max_length=256)
200
  enc = {k: v.to(DEVICE) for k, v in enc.items()}
201
-
202
- log.info("[GEN] start (tok_len=%d)", int(enc["input_ids"].shape[-1]))
203
  out = m.generate(
204
  **enc,
205
  max_new_tokens=int(max_new_tokens),
@@ -208,10 +243,8 @@ def translate_with_model(text: str, max_new_tokens: int = 48) -> str:
208
  no_repeat_ngram_size=3,
209
  early_stopping=True,
210
  )
211
- log.info("[GEN] done")
212
  return tok.decode(out[0], skip_special_tokens=True)
213
 
214
- # Preload supaya request pertama nggak lama/timeout
215
  def _preload_thread():
216
  try:
217
  _load_model()
@@ -239,8 +272,8 @@ def diag():
239
  try:
240
  import torch, transformers, peft
241
  torch_v = torch.__version__
242
- tf_v = transformers.__version__
243
- peft_v = peft.__version__
244
  except Exception as e:
245
  torch_v = tf_v = peft_v = f"import error: {e}"
246
 
@@ -298,7 +331,7 @@ def register_get():
298
  @app.post("/register")
299
  def register_post():
300
  email = (request.form.get("email") or "").strip().lower()
301
- pwd = request.form.get("password") or ""
302
  if not email or not pwd:
303
  flash("Isi email dan password", "error"); return redirect(url_for("register_get"))
304
  with SessionLocal() as s:
@@ -344,18 +377,10 @@ def api_translate():
344
 
345
  try:
346
  clean = prenorm(text)
347
-
348
- if FALLBACK_TRANSLATE:
349
- mt = f"[FAKE] {clean}"
350
- else:
351
- mt = translate_with_model(clean, max_new_tokens=max_new)
352
-
353
- # Simpan riwayat
354
  with SessionLocal() as s:
355
  s.add(Translation(user_id=session["uid"], src=text, mt=mt))
356
  s.commit()
357
-
358
- # Kompatibel dengan frontend kamu (pakai j.mt)
359
  return jsonify({"ok": True, "mt": mt})
360
  except Exception as e:
361
  log.error("[API] translate error: %s", e)
 
1
  # app.py
2
  # PapuaTranslate — Flask + SQLAlchemy (Supabase/SQLite) + mT5/LoRA (lazy)
3
+ # Fitur: ProxyFix, cookie Spaces, session permanent, /diag, preload, fallback,
4
+ # strip BOM JSON, sanitize adapter_config untuk kompatibilitas PEFT lama.
5
 
6
+ import os, re, json, codecs, pathlib, logging, threading, traceback, inspect
7
  from datetime import datetime, timezone, timedelta
8
  from functools import wraps
9
  from flask import Flask, render_template, request, redirect, url_for, session, jsonify, flash
 
16
 
17
  # ========== Flask ==========
18
  app = Flask(__name__, template_folder="frontend", static_folder="static")
 
 
19
  app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1)
 
 
20
  app.config.update(
21
  SECRET_KEY=os.getenv("SECRET_KEY", "dev-secret-change-me"),
22
  SESSION_COOKIE_NAME="hfspace_session",
23
+ SESSION_COOKIE_SAMESITE="None",
24
+ SESSION_COOKIE_SECURE=True,
25
  SESSION_COOKIE_HTTPONLY=True,
26
  SESSION_COOKIE_PATH="/",
27
  PREFERRED_URL_SCHEME="https",
 
41
  DATABASE_URL = "sqlite:////tmp/app.db"
42
  log.warning("[DB] DATABASE_URL tidak diset; pakai SQLite /tmp/app.db")
43
  else:
 
44
  if DATABASE_URL.startswith("postgres://"):
45
  DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
46
  elif DATABASE_URL.startswith("postgresql://"):
47
  DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
 
48
  if DATABASE_URL.startswith("postgresql+psycopg2") and "sslmode=" not in DATABASE_URL:
49
  sep = "&" if "?" in DATABASE_URL else "?"
50
  DATABASE_URL = f"{DATABASE_URL}{sep}sslmode=require"
 
90
  return fn(*args, **kwargs)
91
  return _wrap
92
 
93
+ # ========== Prenorm ==========
94
  PAPUA_MAP = {
95
  r"\bsa\b": "saya", r"\bko\b": "kamu", r"\btra\b": "tidak", r"\bndak\b": "tidak",
96
  r"\bmo\b": "mau", r"\bpu\b": "punya", r"\bsu\b": "sudah", r"\bkong\b": "kemudian",
 
101
  for pat, repl in PAPUA_MAP.items(): t = re.sub(pat, repl, t, flags=re.IGNORECASE)
102
  return t
103
 
104
+ # ========== Model (lazy) + Strip BOM + Sanitize adapter_config ==========
105
+ BASE_MODEL_ID = os.getenv("BASE_MODEL_ID", "google/mt5-small")
106
+ ADAPTER_ID = os.getenv("ADAPTER_ID", "")
107
  DEVICE = "cuda" if os.getenv("DEVICE", "cpu") == "cuda" else "cpu"
108
 
109
  TOK = None
 
113
  _MODEL_ERROR = None
114
 
115
  def _strip_bom_in_dir(root_dir: str):
 
116
  root = pathlib.Path(root_dir)
117
  for p in root.rglob("*.json"):
118
  try:
 
122
  json.dump(data, f, ensure_ascii=False, indent=2)
123
  log.info(f"[BOM] stripped: {p}")
124
  except Exception as e:
 
125
  log.warning(f"[BOM] skip {p}: {e}")
126
 
127
def _sanitize_adapter_config(adapter_dir: str):
    """Drop keys from the adapter config that the installed peft.LoraConfig rejects.

    Adapters saved with a newer PEFT release can carry config fields that an
    older runtime does not know, which fails with errors such as
    ``LoraConfig.__init__() got an unexpected keyword argument 'e_rank_pattern'``.
    This rewrites the config file in place, keeping only the keys that appear
    as parameters of ``LoraConfig.__init__``.

    The function is best-effort: every failure (peft not importable, signature
    not readable, config missing/unreadable/unwritable) is logged and the
    function returns without raising.

    Args:
        adapter_dir: Directory that holds the downloaded adapter files.

    Returns:
        None. The only effect is the (possibly) rewritten config file.
    """
    try:
        # Imported lazily so the check runs against the PEFT version that is
        # actually installed at runtime. Broad catch is deliberate: a broken
        # install can fail with more than ImportError.
        from peft import LoraConfig
    except Exception as e:
        log.warning("[SAN] Tidak bisa import LoraConfig: %s", e)
        return

    try:
        params = tuple(inspect.signature(LoraConfig.__init__).parameters.values())
    except (TypeError, ValueError) as e:
        log.warning("[SAN] Tidak bisa baca signature LoraConfig: %s", e)
        return

    # If the constructor takes **kwargs, no key can be "unexpected"; filtering
    # by parameter name would then throw away valid fields.
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params):
        return
    allowed = {p.name for p in params}

    # Normal name first, then the alternative names some repos use.
    # NOTE(review): "config.json" is usually a base-model config; confirm that
    # treating it as an adapter config is intended.
    root = pathlib.Path(adapter_dir)
    candidates = ("adapter_config.json", "adapter_config_0.json", "config.json")
    cfg_path = next((root / name for name in candidates if (root / name).exists()), None)
    if cfg_path is None:
        log.warning("[SAN] adapter_config tidak ditemukan di %s", adapter_dir)
        return

    try:
        # utf-8-sig tolerates a leading BOM.
        cfg = json.loads(cfg_path.read_text(encoding="utf-8-sig"))
    except (OSError, ValueError) as e:
        log.warning("[SAN] gagal baca adapter_config: %s", e)
        return

    if not isinstance(cfg, dict):
        # A list/scalar at the top level has no keys to filter.
        log.warning("[SAN] adapter_config bukan objek JSON: %s", cfg_path)
        return

    dropped = [k for k in cfg if k not in allowed]
    if not dropped:
        return

    log.info("[SAN] drop fields tidak dikenal: %s", dropped)
    cleaned = {k: v for k, v in cfg.items() if k in allowed}
    try:
        # Serialize fully before touching the file so a serialization error
        # cannot leave a truncated config behind.
        payload = json.dumps(cleaned, ensure_ascii=False, indent=2)
        cfg_path.write_text(payload, encoding="utf-8")
    except (OSError, TypeError, ValueError) as e:
        log.warning("[SAN] gagal tulis adapter_config: %s", e)
172
+
173
  def _load_model():
174
+ """Download ke /tmp, strip BOM, sanitize adapter_config, lalu load dari path lokal."""
175
  global TOK, MODEL, _MODEL_READY, _MODEL_ERROR
176
  try:
177
  log.info("[MODEL] downloading base=%s adapter=%s", BASE_MODEL_ID, ADAPTER_ID or "-")
 
193
  allow_patterns=None,
194
  )
195
  _strip_bom_in_dir(adapter_dir)
196
+ _sanitize_adapter_config(adapter_dir)
197
 
 
198
  import torch
199
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
200
  from peft import PeftModel
 
233
  tok, m = get_model()
234
  if not _MODEL_READY or m is None:
235
  raise RuntimeError(f"Model not ready: {_MODEL_ERROR or 'unknown error'}")
 
 
236
  enc = tok([text], return_tensors="pt", truncation=True, max_length=256)
237
  enc = {k: v.to(DEVICE) for k, v in enc.items()}
 
 
238
  out = m.generate(
239
  **enc,
240
  max_new_tokens=int(max_new_tokens),
 
243
  no_repeat_ngram_size=3,
244
  early_stopping=True,
245
  )
 
246
  return tok.decode(out[0], skip_special_tokens=True)
247
 
 
248
  def _preload_thread():
249
  try:
250
  _load_model()
 
272
  try:
273
  import torch, transformers, peft
274
  torch_v = torch.__version__
275
+ tf_v = transformers.__version__
276
+ peft_v = peft.__version__
277
  except Exception as e:
278
  torch_v = tf_v = peft_v = f"import error: {e}"
279
 
 
331
  @app.post("/register")
332
  def register_post():
333
  email = (request.form.get("email") or "").strip().lower()
334
+ pwd = (request.form.get("password") or "")
335
  if not email or not pwd:
336
  flash("Isi email dan password", "error"); return redirect(url_for("register_get"))
337
  with SessionLocal() as s:
 
377
 
378
  try:
379
  clean = prenorm(text)
380
+ mt = f"[FAKE] {clean}" if FALLBACK_TRANSLATE else translate_with_model(clean, max_new_tokens=max_new)
 
 
 
 
 
 
381
  with SessionLocal() as s:
382
  s.add(Translation(user_id=session["uid"], src=text, mt=mt))
383
  s.commit()
 
 
384
  return jsonify({"ok": True, "mt": mt})
385
  except Exception as e:
386
  log.error("[API] translate error: %s", e)