"""
ChatBIA Hugging Face Spaces API
24/7 회계 AI 서버
"""
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from typing import Optional, AsyncGenerator
import os
import json
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# FastAPI application instance. The title, description, and version below are
# passed to the FastAPI constructor as API metadata.
app = FastAPI(
    title="ChatBIA API",
    description="회계 전문 AI 서버 (Hugging Face Spaces)",
    version="1.0.0"
)

# CORS configuration: every origin, method, and header is allowed.
# NOTE(review): allow_origins=["*"] is combined with allow_credentials=True.
# The FastAPI CORS documentation states that wildcards must not be used when
# credentials are enabled. Confirm whether credentialed cross-origin requests
# are actually needed, then either list explicit origins or disable credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Hugging Face model repositories and the GGUF file fetched from each one.
# The "general" pair backs mode == "general"; the "BSL" pair backs every other
# mode value (see the model selection in the chat handlers).
GENERAL_MODEL_REPO = "Qwen/Qwen2.5-3B-Instruct-GGUF"
GENERAL_MODEL_FILE = "qwen2.5-3b-instruct-q4_k_m.gguf"
BSL_MODEL_REPO = "Seounghyup/ChatBIA-3B-v0.1"
BSL_MODEL_FILE = "ChatBIA-3B-v0.1-Q4_K_M.gguf"

# Global model state, filled in by the startup hook (load_models).
# A *_model_path is set once its download succeeds; a *_model is set once the
# Llama object has been constructed. Both stay None on failure, which the
# endpoints use to report the model as not loaded.
general_model = None
bsl_model = None
general_model_path = None
bsl_model_path = None


class ChatRequest(BaseModel):
    # Request body shared by the /chat and /chat/stream endpoints.
    message: str  # user text; inserted verbatim into the prompt template
    mode: str = "bsl"  # "general" selects the general model; any other value selects the BSL model
    max_tokens: int = 1024  # forwarded unchanged to the model call as max_tokens
    temperature: float = 0.7  # forwarded unchanged to the model call as temperature


class ChatResponse(BaseModel):
    # Response body of the non-streaming /chat endpoint.
    response: str  # generated text with leading/trailing whitespace stripped
    mode: str  # the mode value echoed back from the request
    tokens: int  # size of the generated text as counted by the /chat handler


def _download_and_load_model(repo_id: str, filename: str, label: str,
                             failure_label: Optional[str] = None) -> tuple:
    """Download one GGUF file from the Hugging Face Hub and load it with Llama.

    Progress and failures are reported with ``print``. Any exception raised by
    the download or by the model constructor is caught and logged so that a
    failure of one model does not prevent the server from starting.

    Args:
        repo_id: Hugging Face repository id to download from.
        filename: Name of the GGUF file inside the repository.
        label: Human-readable model label used in the progress messages.
        failure_label: Label used in the failure message; defaults to ``label``.

    Returns:
        A ``(path, model)`` tuple. ``path`` is the value returned by
        ``hf_hub_download`` or ``None`` if the download failed; ``model`` is the
        constructed ``Llama`` object or ``None`` if either step failed.
    """
    path = None
    model = None

    print(f"🔄 {label} 모델 다운로드 중: {repo_id}/{filename}")
    try:
        path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="model"
        )
        print(f"✅ {label} 모델 다운로드 완료: {path}")

        model = Llama(
            model_path=path,
            n_ctx=2048,
            n_threads=2,  # Spaces CPU limit
            n_gpu_layers=0,
            verbose=False
        )
        print(f"✅ {label} 모델 로드 완료")
    except Exception as e:
        # Deliberate best-effort: log and continue so the other model (and the
        # health endpoints) remain available.
        print(f"❌ {failure_label or label} 모델 로드 실패: {e}")

    return path, model


@app.on_event("startup")
async def load_models():
    """Download and load both models when the server starts.

    Populates the module-level ``general_model`` / ``bsl_model`` and their
    ``*_path`` companions. A global is only overwritten when the corresponding
    step succeeded, so after a failure it keeps its previous value (``None`` on
    first startup). Failures are logged, never raised.
    """
    global general_model, bsl_model, general_model_path, bsl_model_path

    # General-purpose model.
    path, model = _download_and_load_model(
        GENERAL_MODEL_REPO, GENERAL_MODEL_FILE, "일반 모드"
    )
    if path is not None:
        general_model_path = path
    if model is not None:
        general_model = model

    # BSL (accounting) model. Its failure message uses a shorter label than its
    # progress messages, so it is passed separately to keep log output unchanged.
    path, model = _download_and_load_model(
        BSL_MODEL_REPO, BSL_MODEL_FILE, "BSL 모드", failure_label="BSL"
    )
    if path is not None:
        bsl_model_path = path
    if model is not None:
        bsl_model = model


def build_prompt(message: str, mode: str) -> str:
    """Build the ChatML prompt sent to the model.

    The prompt consists of a system turn, a user turn containing ``message``,
    and an opened assistant turn for the model to complete.

    Args:
        message: User text. It is interpolated verbatim; ChatML control
            markers inside it are not escaped.
        mode: ``"bsl"`` selects the accounting system prompt; any other value
            selects the generic assistant system prompt.

    Returns:
        The complete prompt string, ending with the assistant turn header.
    """
    if mode == "bsl":
        system_text = (
            "You are a professional accounting AI assistant. "
            "Respond naturally in Korean.\n"
            "\n"
            "Important: Only generate BSL DSL code when the user explicitly "
            'requests calculations (e.g., "계산해줘", "코드 작성해줘", '
            '"BSL로 작성해줘"). For general questions or greetings, respond '
            "conversationally without code."
        )
    else:
        system_text = "You are a helpful AI assistant. Respond naturally in Korean."

    turns = (
        f"<|im_start|>system\n{system_text}<|im_end|>\n",
        f"<|im_start|>user\n{message}<|im_end|>\n",
        "<|im_start|>assistant\n",
    )
    return "".join(turns)


@app.get("/")
async def root():
    """Health check.

    Returns static service metadata plus a ``models`` mapping that reports,
    per model, whether the corresponding module-level model object is set.
    """
    availability = {
        "general": general_model is not None,
        "bsl": bsl_model is not None,
    }
    payload = dict(
        status="online",
        service="ChatBIA API",
        version="1.0.0",
        platform="Hugging Face Spaces",
    )
    payload["models"] = availability
    return payload


@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Run one non-streaming completion and return the whole reply.

    Args:
        request: Chat request; ``mode == "general"`` uses the general model,
            any other mode uses the BSL model.

    Returns:
        ChatResponse with the stripped reply text, the echoed mode, and the
        number of generated tokens.

    Raises:
        HTTPException: 503 when the selected model has not been loaded;
            500 when prompt building or inference fails.
    """
    # Select the model for the requested mode.
    if request.mode == "general":
        model = general_model
        model_name = "General"
    else:
        model = bsl_model
        model_name = "BSL"

    # Fail fast when the model is unavailable.
    if model is None:
        raise HTTPException(
            status_code=503,
            detail=f"{model_name} 모델이 로드되지 않았습니다."
        )

    try:
        prompt = build_prompt(request.message, request.mode)

        # NOTE: this call is synchronous and runs on the event loop, so other
        # requests wait until generation finishes.
        completion = model(
            prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1,
            stop=["<|im_end|>", "###", "\n\n\n"]
        )

        raw_text = completion["choices"][0]["text"]

        # Prefer the model's own token accounting. Splitting on whitespace
        # counts words, which is not a token count (and is far off for Korean).
        # The word count is kept only as a fallback when usage data is absent.
        usage = completion.get("usage") or {}
        tokens = usage.get("completion_tokens")
        if tokens is None:
            tokens = len(raw_text.split())

        return ChatResponse(
            response=raw_text.strip(),
            mode=request.mode,
            tokens=tokens
        )

    except Exception as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(
            status_code=500,
            detail=f"AI 모델 처리 중 오류: {str(e)}"
        ) from e


@app.get("/models")
async def get_models():
    """List the available models.

    For each of ``general`` and ``bsl`` the response reports whether the model
    object is set (``loaded``) and the stored model file path (``path``), which
    is ``None`` if the path was never recorded.
    """
    entries = (
        ("general", general_model, general_model_path),
        ("bsl", bsl_model, bsl_model_path),
    )
    catalog = {}
    for key, instance, location in entries:
        catalog[key] = {"loaded": instance is not None, "path": location}
    return catalog


@app.post("/chat/stream")
async def chat_stream(request: ChatRequest):
    """Streaming chat endpoint (for Android clients / avoiding timeouts).

    Emits Server-Sent Events, one ``data: {json}`` frame per generated text
    chunk, followed by a final frame with ``done: true``. If generation fails
    after the response has started, a frame carrying ``error`` is sent instead.

    Args:
        request: Chat request; ``mode == "general"`` uses the general model,
            any other mode uses the BSL model.

    Raises:
        HTTPException: 503 when the selected model has not been loaded.
    """
    # Pick the model and its display name from the requested mode.
    use_general = request.mode == "general"
    model = general_model if use_general else bsl_model
    model_name = "General" if use_general else "BSL"

    # Refuse the request up front when the model is unavailable.
    if model is None:
        raise HTTPException(
            status_code=503,
            detail=f"{model_name} 모델이 로드되지 않았습니다."
        )

    def as_sse(payload: dict) -> str:
        """Serialize *payload* as one SSE frame: ``data: {json}`` + blank line."""
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    async def generate_stream() -> AsyncGenerator[str, None]:
        """Yield SSE frames as the model produces text chunks."""
        import asyncio

        emitted = 0
        try:
            # Streaming inference; the prompt is built inside the try so that
            # prompt errors are also reported as an SSE error frame.
            chunks = model(
                build_prompt(request.message, request.mode),
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                top_p=0.9,
                top_k=40,
                repeat_penalty=1.1,
                stop=["<|im_end|>", "###", "\n\n\n"],
                stream=True,
            )

            for chunk in chunks:
                # Skip chunks without choices or with empty text.
                if "choices" not in chunk or len(chunk["choices"]) == 0:
                    continue
                piece = chunk["choices"][0].get("text", "")
                if not piece:
                    continue

                emitted += 1
                yield as_sse({
                    "token": piece,
                    "done": False,
                    "token_count": emitted,
                })
                # Hand control back to the event loop so the frame can be
                # sent right away.
                await asyncio.sleep(0)

            # Completion signal.
            yield as_sse({
                "token": "",
                "done": True,
                "token_count": emitted,
                "mode": request.mode,
            })

        except Exception as exc:
            yield as_sse({
                "error": str(exc),
                "done": True,
            })

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",  # disable Nginx buffering
    }
    return StreamingResponse(
        generate_stream(),
        media_type="text/event-stream",
        headers=sse_headers,
    )