JacobWP commited on
Commit
4ed530f
·
verified ·
1 Parent(s): c94925b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +337 -0
app.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Mon May 19 16:49:22 2025
5
+
6
+ @author: jacobwildt-persson
7
+ """
8
+
9
+ #!/usr/bin/env python3
10
+ # -*- coding: utf-8 -*-
11
+ # -----------------------------------------------
12
+ # Requirements & Setup Instructions
13
+ # -----------------------------------------------
14
+
15
+ # Python version:
16
+ # Requires Python 3.10 or later (tested on 3.12)
17
+
18
+
19
+ # Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
20
+ # Recreate the environment with these commands in the terminal:
21
+ # conda env create -f environment.yml
22
+ # conda activate sprakenv
23
+ #
24
+
25
+ # Install all required packages:
26
+ # Run these commands in the terminal:
27
+
28
+ # pip install --upgrade gradio
29
+ # pip install pdfplumber
30
+ # pip install nltk
31
+ # pip install transformers
32
+ # pip install -U spacy
33
+
34
+ # Download language models:
35
+ # python -m spacy download es_core_news_lg
36
+ # python -m spacy download en_core_web_lg # if you add NER for English
37
+
38
+ # Check Gradio version used:
39
+ # import gradio as gr
40
+ # print(gr.__version__) # Gradio version 4.18.0
41
+
42
+ # 🔗 Reference: Gradio Quickstart Guide
43
+ # https://www.gradio.app/guides/quickstart
44
+ #Hugging Face
45
+ # https://huggingface.co/models
46
+
47
+ # English API model
48
+ # LanguageTool API: https://languagetool.org/http-api/swagger
49
+
50
+
51
+
52
+ # Remember!
53
+ # Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
54
+ # Recreate the environment with these commands in the terminal:
55
+ # conda env create -f environment.yml
56
+ # conda activate sprakenv
57
+ # python -m spacy download es_core_news_lg
58
+ #python -m nltk.downloader punkt wordnet
59
+ # -----------------------------------------------
60
+ """
61
+ Language learning app with a Gradio UI and support for multiple users:
62
+ - Import text from file (.txt/.csv/.pdf) or manual text input
63
+ - Grammar correction via transformers (Spanish) or LanguageTool API (English)
64
+ - Analyze text (known/unknown words) per user & language
65
+ - Save unknown words as known
66
+ - Generate coherent practice sentence (Spanish & English)
67
+ - Log grammar corrections and practice sentence suggestions to CSV
68
+ """
69
+ import os
70
+ import datetime
71
+ import sqlite3
72
+ import requests
73
+ import random
74
+ import pandas as pd
75
+ import pdfplumber
76
+ import spacy
77
+ import csv
78
+ # SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
79
+ import sqlite3
80
+
81
+ from nltk.tokenize import word_tokenize
82
+ from nltk.stem import WordNetLemmatizer
83
+ from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
84
+ import gradio as gr
85
+ import gradio_client.utils as _gcu
86
+
87
+ # --- PATCH for Gradio utils schema bug ---
88
# Keep handles to the unpatched gradio_client helpers so the defensive
# wrappers defined in this file can delegate to them.
_orig_json, _orig_get = _gcu.json_schema_to_python_type, _gcu.get_type
90
+
91
+ def _patched_json_to_py(schema, defs=None):
92
+ if not isinstance(schema, dict):
93
+ return "any"
94
+ try:
95
+ return _orig_json(schema, defs)
96
+ except Exception:
97
+ return "any"
98
+
99
+ def _patched_get_type(schema):
100
+ if not isinstance(schema, dict):
101
+ return "any"
102
+ try:
103
+ return _orig_get(schema)
104
+ except Exception:
105
+ return "any"
106
+
107
# Install the defensive wrappers in place of the original helpers.
_gcu.json_schema_to_python_type, _gcu.get_type = (
    _patched_json_to_py,
    _patched_get_type,
)
109
+
110
+ # --- SQLite Database initialization ---
111
# Path of the SQLite file that stores every user's known words.
DB_NAME = "vocabulary.db"

# Create the vocabulary table at import time if it does not exist yet.
# One row per (user_id, language, word); the UNIQUE constraint rejects
# duplicate rows for the same user and language.
conn = sqlite3.connect(DB_NAME)
try:
    conn.execute("""
        CREATE TABLE IF NOT EXISTS vocabulary (
            user_id TEXT,
            language TEXT,
            word TEXT,
            timestamp TEXT,
            UNIQUE(user_id, language, word)
        )
    """)
    conn.commit()
finally:
    # Release the handle even if table creation fails.
    conn.close()
124
+
125
+ # --- Save word to database ---
126
def save_word_to_db(user_id: str, language: str, word: str):
    """Store ``word`` as a known word for ``user_id`` in ``language``.

    The row is stamped with the current local time in ISO-8601 format.
    ``INSERT OR IGNORE`` is used, so an insert that violates a table
    constraint (for example an already stored word) is skipped silently.

    Args:
        user_id: Identifier of the user the word belongs to.
        language: Language code the word belongs to.
        word: The word to store.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute(
            "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
            (user_id, language, word, ts),
        )
        conn.commit()
    finally:
        # Close the handle even when the insert fails, so no connection leaks.
        conn.close()
135
+
136
+ # --- Retrieve known words for user/language ---
137
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    """Return the set of words stored for ``user_id`` in ``language``.

    Args:
        user_id: Identifier of the user whose words are fetched.
        language: Language code to filter on.

    Returns:
        The stored words; an empty set when nothing matches.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        rows = conn.execute(
            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
            (user_id, language),
        ).fetchall()
    finally:
        # Close the handle even when the query fails, so no connection leaks.
        conn.close()
    return {row[0] for row in rows}
145
+
146
+ # --- Load NLP models ---
147
# Checkpoint identifiers, named once so each tokenizer/model pair is
# guaranteed to be loaded from the same checkpoint.
_GEC_ES_CHECKPOINT = "SkitCon/gec-spanish-BARTO-COWS-L2H"
_GPT2_ES_CHECKPOINT = "mrm8488/spanish-gpt2"
_GPT2_EN_CHECKPOINT = "gpt2"

# spaCy pipeline; used in this file to split Spanish text into sentences.
nlp = spacy.load("es_core_news_lg")

# Seq2seq model used for Spanish grammar correction.
tokenizer = AutoTokenizer.from_pretrained(_GEC_ES_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_GEC_ES_CHECKPOINT)

# Causal language models used to generate practice sentences.
gpt2_tokenizer_es = AutoTokenizer.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_model_es = AutoModelForCausalLM.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_tokenizer_en = AutoTokenizer.from_pretrained(_GPT2_EN_CHECKPOINT)
gpt2_model_en = AutoModelForCausalLM.from_pretrained(_GPT2_EN_CHECKPOINT)

# WordNet lemmatizer used when analysing known/unknown words.
lemmatizer = WordNetLemmatizer()
155
+
156
+ # ---Log to CSV (grammar corrections and sentence suggestions) ---
157
def log_to_csv(filename, row, fieldnames):
    """Append ``row`` to the CSV log ``filename``, adding a header if needed.

    Args:
        filename: Path of the CSV file; it is created when missing.
        row: Mapping from field name to value for the new record.
        fieldnames: Column names, in output order.

    The header row is written when the file does not exist yet *or* exists
    but is empty, so a pre-created empty log still gets its column names.
    """
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    with open(filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
164
+
165
+ # --- File Import ---
166
def import_file(path: str) -> str:
    """Return the text content of a ``.pdf``, ``.csv`` or ``.txt`` file.

    The reader is chosen from the file extension (case-insensitive).

    Args:
        path: Path of the file to read.

    Returns:
        For PDF, the extracted text of all pages joined by newlines (a page
        without extractable text contributes an empty string). For CSV, the
        values of the ``text`` column joined by newlines. For TXT, the whole
        file decoded as UTF-8.

    Raises:
        ValueError: If a CSV has no ``text`` column or the extension is not
            one of the supported ones.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        with pdfplumber.open(path) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" not in df:
            raise ValueError("CSV saknar kolumnen 'text'.")
        return "\n".join(df["text"].astype(str))
    if ext == ".txt":
        # Context manager guarantees the handle is closed after reading.
        with open(path, encoding="utf-8") as fh:
            return fh.read()
    raise ValueError(f"Okänt filformat: {ext}")
182
+
183
+ # --- Grammar Correction ---
184
+
185
def correct_grammar(text: str, language: str) -> str:
    """Return ``text`` with grammar corrections applied.

    For ``language == "es"`` the text is split into sentences with the
    spaCy pipeline and each non-empty sentence is rewritten by the local
    seq2seq correction model; the results are joined with single spaces.
    Every other language code is sent to the public LanguageTool HTTP API
    and the first suggested replacement of each match is applied.

    Args:
        text: The text to correct.
        language: Language code; ``"es"`` selects the local model.

    Returns:
        The corrected text.

    Raises:
        requests.RequestException: If the LanguageTool request fails, times
            out or returns an HTTP error status.
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # Allow as many new tokens as the input sentence has.
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True,
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)

    # Other languages: LanguageTool API. A timeout keeps a stalled request
    # from hanging the UI callback; HTTP errors are raised explicitly
    # instead of surfacing as a confusing JSON/KeyError later on.
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30,
    )
    resp.raise_for_status()
    matches = resp.json().get("matches", [])

    # Apply edits from the end of the text backwards so that the offsets of
    # the matches still to be applied remain valid.
    untouched_end = len(text)
    for m in sorted(matches, key=lambda match: match["offset"], reverse=True):
        off, ln = m["offset"], m["length"]
        repls = m.get("replacements", [])
        # No suggestion: leave the flagged span as-is rather than deleting it.
        # Overlap with an already applied edit: skip to avoid garbling text.
        if not repls or off + ln > untouched_end:
            continue
        text = text[:off] + repls[0]["value"] + text[off + ln:]
        untouched_end = off
    return text
211
+
212
+ # --- Analyze known and unknown words ---
213
+
214
def analyze_text(text: str, user_id: str, language: str):
    """Split the lemmatized words of ``text`` into known and unknown words.

    Tokens that are purely alphabetic are lower-cased and lemmatized; each
    lemma is then classified by membership in the vocabulary stored for
    ``user_id`` and ``language``.

    Returns:
        A ``(known, unknown)`` pair of lists, each in order of appearance
        with duplicates kept.
    """
    # NOTE(review): tokenization and lemmatization do not depend on
    # ``language`` here; confirm this is intended for Spanish input.
    lemmas = []
    for token in word_tokenize(text):
        if token.isalpha():
            lemmas.append(lemmatizer.lemmatize(token.lower()))

    vocab = get_user_vocabulary(user_id, language)

    known, unknown = [], []
    for lemma in lemmas:
        bucket = known if lemma in vocab else unknown
        bucket.append(lemma)
    return known, unknown
221
+ # --- Generate sentence using GPT2 based on unknown words ---
222
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence built around unknown words in ``text``.

    Up to ``num_unknown`` distinct unknown words are picked at random and
    placed in a prompt for the Spanish GPT-2 model (``language == "es"``)
    or the English GPT-2 model (any other language code). The generated
    continuation is cut at its first full stop.

    Args:
        text: Source text whose unknown words are used.
        user_id: User whose stored vocabulary decides what is unknown.
        language: Language code; selects prompt and model.
        num_unknown: Maximum number of unknown words to include. Floats
            (e.g. from a UI slider) are truncated to int.

    Returns:
        The generated sentence, or a Swedish status message when there are
        no unknown words or the model output contains no letters.
    """
    _, unknown = analyze_text(text, user_id, language)
    if not unknown:
        return "Inga okända ord att generera mening med."

    # De-duplicate (keeping first-seen order) so the same word cannot be
    # sampled twice for one prompt.
    candidates = list(dict.fromkeys(unknown))
    # random.sample requires an int count within [0, len(candidates)].
    count = max(0, min(int(num_unknown), len(candidates)))
    chosen = random.sample(candidates, count)

    # Distinct local names avoid shadowing the module-level grammar
    # ``tokenizer``/``model`` pair.
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        gen_tokenizer, gen_model = gpt2_tokenizer_es, gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        gen_tokenizer, gen_model = gpt2_tokenizer_en, gpt2_model_en

    inp = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = gen_model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        # GPT-2 checkpoints define no pad token; reuse EOS explicitly.
        pad_token_id=gen_tokenizer.eos_token_id,
    )
    gen = gen_tokenizer.decode(outs[0], skip_special_tokens=True)

    # Drop the echoed prompt, then keep only the first sentence.
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
249
+
250
+
251
+ # --- Gradio process callback ---
252
def process(user, language, txt, file, do_grammar, do_save):
    """Gradio callback: optionally correct the text, analyse it and log the run.

    Typed text takes precedence over an uploaded file. The text is
    grammar-corrected when ``do_grammar`` is set, split into known and
    unknown words for ``user``/``language``, and the unknown words are
    stored as known when ``do_save`` is set. Every successful run is
    appended to ``grammarlog.csv``.

    Args:
        user: User name from the UI.
        language: Language code from the UI.
        txt: Manually entered text (may be empty or ``None``).
        file: Uploaded file; either an object with a ``name`` attribute
            holding the path, or the path itself.
        do_grammar: Whether to run grammar correction.
        do_save: Whether to store the unknown words as known.

    Returns:
        A 5-tuple ``(text, known words, unknown words, status, "")``. The
        last element is always an empty string. On any exception the first
        three elements are empty and the status holds the traceback.
    """
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            # Accept both a file-like wrapper exposing ``.name`` and a plain
            # path string, which has no ``.name`` attribute.
            text = import_file(getattr(file, "name", file))
        else:
            return "", "", "", "Ingen text angiven.", ""

        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)

        status = ""
        if do_save and un:
            # Count and store each distinct word once; duplicates in ``un``
            # would otherwise inflate the reported number.
            new_words = list(dict.fromkeys(un))
            for w in new_words:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(new_words)} ord."

        # Log the grammar correction to CSV.
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat(),
            },
            ["user", "language", "input", "output", "timestamp"],
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        # Top-level UI boundary: report the failure in the status box
        # instead of letting the callback crash.
        import traceback
        return "", "", "", f"FEL i process:\n{traceback.format_exc()}", ""
281
+
282
+ # --- Sentence generation callback ---
283
def coherent_fn(user, language, txt, num):
    """Gradio callback: generate a practice sentence and log the suggestion.

    Args:
        user: User name from the UI.
        language: Language code from the UI.
        txt: Text whose unknown words are used (``None`` is treated as "").
        num: Number of unknown words to include.

    Returns:
        The suggested sentence, or ``"Fel vid generering: ..."`` with the
        exception text if generation or logging fails.
    """
    log_fields = ["user", "language", "input", "output", "timestamp"]
    try:
        suggestion = generate_coherent_sentence(txt or "", user, language, num)
        # Log the practice suggestion to CSV.
        record = {
            "user": user,
            "language": language,
            "input": txt,
            "output": suggestion,
            "timestamp": datetime.datetime.now().isoformat(),
        }
        log_to_csv("sentencelog.csv", record, log_fields)
    except Exception as e:
        return f"Fel vid generering: {e}"
    return suggestion
298
+
299
+ # --- Gradio UI ---
300
# NOTE(review): the Row/Column nesting below was reconstructed from a copy of
# this file whose indentation was lost -- confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")

    # Who is practising, and in which language.
    with gr.Row():
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")

    # Text sources, options and action buttons.
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(
            file_types=[".txt", ".csv", ".pdf"],
            label="Importera fil",
        )
        # The label contains a soft hyphen (U+00AD), written as an escape so
        # the invisible character is explicit.
        grammar_cb = gr.Checkbox(label="Grammatik\u00adrättning")
        autosave_cb = gr.Checkbox(label="Spara okända ord")
        run_btn = gr.Button("Kör analys & korrigering")
        num_slider = gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=2,
            label="Antal okända ord för övning",
        )
        coherent_btn = gr.Button("Koherent övningsmening")

    # Result fields.
    corr_out = gr.Textbox(label="Korrigerad text", lines=4)
    known_out = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # --- Button click wiring ---
    run_btn.click(
        fn=process,
        inputs=[
            user_input,
            lang_dd,
            manual_input,
            file_input,
            grammar_cb,
            autosave_cb,
        ],
        outputs=[corr_out, known_out, unknown_out, status_out, coherent_out],
    )
    coherent_btn.click(
        fn=coherent_fn,
        inputs=[user_input, lang_dd, manual_input, num_slider],
        outputs=[coherent_out],
    )
332
+ # Make sure the selected language matches the text file, so it is analyzed in its target language.
333
+
334
+ # --- Start app ---
335
if __name__ == "__main__":
    # launch() returns (app, local_url, share_url); unpack it so the real
    # URL is printed rather than the whole tuple.
    _app, local_url, share_url = demo.launch(
        share=True,
        inbrowser=True,
        prevent_thread_lock=True,
    )
    print("Appen körs på:", share_url or local_url)
    # With prevent_thread_lock=True launch() returns immediately; block here
    # so the script (and with it the server) does not exit right away.
    demo.block_thread()