File size: 8,452 Bytes
c32e43b
 
 
 
 
d516f3e
c32e43b
f690259
 
 
 
 
c8c1166
f690259
 
 
 
b1539c9
d516f3e
f690259
c32e43b
f690259
c32e43b
f690259
 
 
 
 
 
 
 
 
3d31c30
 
 
f690259
53709cd
 
f690259
 
c32e43b
d516f3e
 
 
 
c32e43b
 
f690259
 
 
 
 
 
c32e43b
 
 
 
 
 
 
 
 
 
 
f690259
 
 
 
 
 
 
c32e43b
 
 
 
 
 
 
 
 
 
 
 
 
 
157ede2
c32e43b
 
 
 
 
 
 
 
 
 
 
 
 
f690259
c32e43b
 
 
 
f690259
 
 
 
 
 
 
aa680b7
f690259
 
aa680b7
 
d516f3e
e58485d
 
aa680b7
 
 
 
 
 
 
f690259
 
 
 
 
 
 
 
 
c32e43b
c8c1166
 
 
 
f690259
d516f3e
c32e43b
 
 
 
 
53709cd
 
 
c32e43b
 
f690259
e58485d
f690259
 
c32e43b
f690259
 
0702135
 
 
 
c8c1166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f690259
 
c32e43b
 
 
 
 
 
d516f3e
f690259
 
 
b1539c9
 
 
 
 
 
1c7ab87
b1539c9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""Hugging Face Space app for NeuroBait (Gemma 3 12B LoRA).

Runtime loads the dense Gemma 3 12B base in 4-bit with the NeuroBait LoRA adapter
through the standard transformers + peft stack. This deliberately avoids Unsloth
at runtime and the Gemma-4 MoE (`Gemma4ClippableLinear`) path that PEFT could not
inject into on ZeroGPU. The look and feel live in ``ui.py``.
"""

from __future__ import annotations

import os
import re
from threading import Lock, Thread

import spaces
import torch

from ui import CSS, I18N, JS, THEME, build_demo, message_text


# Runtime settings. Each one can be overridden through the environment variable
# of the same name; the literal is the fallback used when it is unset.

# Repo id of the base checkpoint the adapter is applied to.
BASE_MODEL = os.getenv("BASE_MODEL", "unsloth/gemma-3-12b-it")

# Repo id of the LoRA adapter. ADAPTER_ID takes precedence, MODEL_ID is the
# fallback variable, and the literal is used when neither is set.
_adapter_fallback = os.getenv("MODEL_ID", "build-small-hackathon/NeuroBait")
ADAPTER_ID = os.getenv("ADAPTER_ID", _adapter_fallback)

# Upper bound on generated tokens per reply.
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "220"))

# 4-bit loading is on unless the variable is "0", "false" or "no" (any case).
_four_bit_setting = os.getenv("LOAD_IN_4BIT", "1").lower()
LOAD_IN_4BIT = _four_bit_setting not in {"0", "false", "no"}

# Access token handed to every hub call; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Base system prompt (Indonesian) that `respond` sends as the system message.
# The em dashes and arrows are real Unicode characters: the previous text held
# their UTF-8 bytes mis-decoded ("β€”", "β†’"), which reached the model verbatim.
SYSTEM_PROMPT = """Kamu adalah NeuroBait — asisten AI untuk orang dengan ADHD dan neurodivergent. Tugasmu bukan membuat to-do list. Tugasmu menyalakan dopamin untuk memicu task initiation.

Dari setiap percakapan, identifikasi dua elemen kunci: (1) deadline anchor — momen nyata atau buatan yang bisa jadi batas waktu relevan; dan (2) object/subject motivator — orang atau hal yang paling emosional signifikan bagi user saat ini. Gunakan keduanya sebagai bahan bakar Resep Engagement yang personal, bukan generik.

Setiap Resep Engagement memuat empat elemen berurut natural: validasi hangat singkat tanpa menghakimi → hook yang membangkitkan rasa flow dari minat atau pengalaman user → stakes berbasis deadline atau motivator nyata → satu micro-action super kecil dan spesifik yang bisa langsung dilakukan.

Kalau user bertanya tentang dirimu, kemampuanmu, atau hal umum di luar konteks task (mis. "siapa kamu", "kamu bisa apa", sapaan, atau basa-basi), jawab langsung dengan singkat dan hangat sebagai NeuroBait — perkenalkan diri dan apa yang kamu bantu — tanpa memaksakan resep atau pertanyaan deadline/motivator. Tawarkan bantuan secara halus, biarkan user yang memutuskan kapan mulai.

Kalau user sudah menyinggung sebuah task tapi konteksnya belum cukup untuk membuat resep yang personal, ajukan tepat satu pertanyaan ringan yang paling berguna — tentang deadline atau motivator. Kalau konteks sudah ada, langsung berikan resep.

Selalu balas dalam bahasa yang sama persis dengan pesan terakhir user: kalau user menulis bahasa Indonesia, jawab dalam bahasa Indonesia; kalau bahasa Inggris, jawab dalam bahasa Inggris. Jangan pernah berpindah bahasa sendiri.

Framing selalu menempatkan user sebagai pelaku aktif dengan agency penuh. Bukan guilt, bukan hutang — selalu agency. Kalimat pendek. Bahasa hidup. Hangat dan padat. Tidak pernah menghakimi. Tidak pernah ceramah. Membuat hal membosankan jadi tak tertahankan."""

# Extra system-prompt sentence for each selectable mood. `respond` looks the
# mood up here and appends the note to the system prompt when one is found.
MOOD_NOTES = dict(
    Calm="Mood note: the user feels calm. Use this ease for a light, playful hook.",
    Tired="Mood note: the user feels tired. Keep the micro-action very small and low energy.",
    Anxious="Mood note: the user feels anxious. Lead with extra warm validation, lower the pressure, keep the micro-action soothing.",
    Focused="Mood note: the user feels focused. Go straight to a hook and one micro-action that rides the momentum.",
)


# Module-wide cache for the loaded model/tokenizer pair. Both slots start empty
# and are filled by `_load_model`, which holds `_load_lock` while it works.
_load_lock = Lock()
_model = _tokenizer = None


def _prewarm() -> None:
    """Download weights to the Space cache on CPU so the GPU window stays short.

    Fetches the ``BASE_MODEL`` and ``ADAPTER_ID`` snapshots. Prewarming is best
    effort: every failure is printed and swallowed. Each repo is attempted on
    its own, so a failure on one does not skip the other.
    """
    try:
        from huggingface_hub import snapshot_download
    except Exception as exc:  # noqa: BLE001 - prewarm is best effort
        print(f">>> prewarm skipped: {exc}", flush=True)
        return

    for repo_id in (BASE_MODEL, ADAPTER_ID):
        try:
            snapshot_download(repo_id, token=HF_TOKEN)
        except Exception as exc:  # noqa: BLE001 - prewarm is best effort
            print(f">>> prewarm skipped for {repo_id}: {exc}", flush=True)


def _load_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    On the first call the tokenizer is loaded from ``ADAPTER_ID``, the base
    model from ``BASE_MODEL`` (with a 4-bit NF4 bitsandbytes config when
    ``LOAD_IN_4BIT`` is set), and the base model is wrapped with the PEFT
    adapter from ``ADAPTER_ID`` and put in eval mode. The pair is stored in the
    module globals, so later calls return it directly. Everything runs while
    holding ``_load_lock``.
    """
    global _model, _tokenizer

    with _load_lock:
        if _model is None or _tokenizer is None:
            import transformers as hf

            loaded_tokenizer = hf.AutoTokenizer.from_pretrained(ADAPTER_ID, token=HF_TOKEN)

            # No quantization config at all when 4-bit loading is switched off.
            quantization = (
                hf.BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )
                if LOAD_IN_4BIT
                else None
            )

            # Choose the auto class from the first architecture name declared
            # in the base model's config.
            base_config = hf.AutoConfig.from_pretrained(BASE_MODEL, token=HF_TOKEN)
            declared = getattr(base_config, "architectures", None) or [""]
            primary_arch = declared[0]
            wants_image_text = any(
                marker in primary_arch
                for marker in ("ConditionalGeneration", "ImageTextToText")
            )
            model_cls = (
                hf.AutoModelForImageTextToText
                if wants_image_text
                else hf.AutoModelForCausalLM
            )

            base_model = model_cls.from_pretrained(
                BASE_MODEL,
                quantization_config=quantization,
                torch_dtype=torch.bfloat16,
                device_map="cuda",
                token=HF_TOKEN,
            )

            from peft import PeftModel

            adapted = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)
            adapted.eval()

            _model, _tokenizer = adapted, loaded_tokenizer

        return _model, _tokenizer


def _history_to_messages(history: list) -> list[dict]:
    """Flatten chat history into ``{"role", "content"}`` message dicts.

    Two entry shapes are understood:

    * a dict with ``role`` and ``content``: kept when the role is ``"user"`` or
      ``"assistant"`` and ``message_text(content)`` is truthy;
    * a two-item tuple or list ``(user_text, assistant_text)``: each side
      becomes its own message when it is a non-blank string, stored stripped.

    Every other entry is ignored.
    """
    converted: list[dict] = []
    for entry in history:
        if isinstance(entry, dict):
            speaker = entry.get("role")
            body = message_text(entry.get("content"))
            if speaker not in {"user", "assistant"} or not body:
                continue
            converted.append({"role": speaker, "content": body})
        elif isinstance(entry, (tuple, list)) and len(entry) == 2:
            # Position decides the role: first slot is the user turn, second
            # slot is the assistant turn.
            for speaker, raw in zip(("user", "assistant"), entry):
                if isinstance(raw, str) and raw.strip():
                    converted.append({"role": speaker, "content": raw.strip()})
    return converted


# Section labels the model sometimes echoes at the start of a line. Only
# horizontal whitespace may precede the label, so a match cannot reach back
# across a blank line and collapse the paragraph break in front of it.
_LABEL_PREFIX_RE = re.compile(
    r"^[^\S\n]*(?:micro-action|hook|stakes|validasi|validation)\s*:\s*",
    re.IGNORECASE | re.MULTILINE,
)


def _clean_response(text: str) -> str:
    """Strip surrounding whitespace and leading section labels from ``text``.

    A label is one of ``micro-action``, ``hook``, ``stakes``, ``validasi`` or
    ``validation`` (any case) at the start of a line, followed by a colon. The
    label, the colon and the whitespace after the colon are removed; blank
    lines before a labelled line are kept.

    Args:
        text: Raw model output, possibly partial while streaming.

    Returns:
        The cleaned text, stripped again after label removal.
    """
    return _LABEL_PREFIX_RE.sub("", text.strip()).strip()


@spaces.GPU(duration=120)
def respond(message: str, history: list[dict], mood: str):
    """Stream a NeuroBait reply for ``message`` given the chat ``history``.

    Generator: after every decoded chunk it yields the whole reply accumulated
    so far, passed through ``_clean_response``. Each yielded value therefore
    replaces the previous one rather than extending it.

    Args:
        message: Latest user message; normalised with ``message_text`` first.
        history: Earlier turns, converted by ``_history_to_messages``.
        mood: Key into ``MOOD_NOTES``; a mood with no entry adds no note.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here after the stream has been drained.
    """
    from transformers import TextIteratorStreamer

    model, tokenizer = _load_model()
    message = message_text(message)

    system = SYSTEM_PROMPT
    note = MOOD_NOTES.get(mood)
    if note:
        system = f"{system}\n\n{note}"
    # Keep the language rule as the final, most-recent instruction so the English
    # mood note above can't prime an English reply to an Indonesian user.
    system = f"{system}\n\nBalas dalam bahasa yang sama dengan pesan terakhir user."

    messages = [{"role": "system", "content": system}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # The template call may hand back a mapping instead of a bare tensor; pull
    # the ids out in that case.
    if not torch.is_tensor(input_ids):
        input_ids = input_ids["input_ids"]
    input_ids = input_ids.to(model.device)

    # Compare against None explicitly: a pad id of 0 is valid but falsy, so an
    # `or` fallback would silently swap it for the EOS id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=pad_token_id,
    )

    failures: list[BaseException] = []

    def _generate() -> None:
        try:
            with torch.inference_mode():
                model.generate(**generate_kwargs)
        except Exception as exc:  # noqa: BLE001 - re-raised by the consumer below
            failures.append(exc)
            # generate() died before signalling end-of-stream. Close the stream
            # ourselves so the consuming loop below does not block forever.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    acc = ""
    for chunk in streamer:
        acc += chunk
        yield _clean_response(acc)

    worker.join()
    if failures:
        # Surface the generation error to the caller instead of losing it in
        # the worker thread.
        raise failures[0]


# Warm the weight cache on CPU when the module is imported. Setting PREWARM to
# "0", "false" or "no" (any case) skips it. The 4-bit load itself stays inside
# the @spaces.GPU window because bitsandbytes quantization needs CUDA.
_prewarm_setting = os.environ.get("PREWARM", "1").lower()
if _prewarm_setting not in {"0", "false", "no"}:
    _prewarm()


# Build the UI around the streaming handler; `build_demo` comes from ui.py.
demo = build_demo(respond)


if __name__ == "__main__":
    # Launch settings, collected in one place; the styling objects come from ui.py.
    # NOTE(review): allowed_paths=["."] presumably exposes the whole working
    # directory to the served app - confirm that is intended.
    launch_options = {
        "show_error": True,
        "css": CSS,
        "theme": THEME,
        "js": JS,
        "i18n": I18N,
        "footer_links": [],
        "allowed_paths": ["."],
    }
    demo.launch(**launch_options)