Spaces:

JacobWP
/

LanguageLearningSpace

Runtime error

App Files Files Community

JacobWP committed on Jul 7, 2025

Commit

4ed530f

verified ·

1 Parent(s): c94925b

Upload app.py

Browse files

Files changed (1) hide show

app.py +337 -0

app.py ADDED Viewed

	@@ -0,0 +1,337 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon May 19 16:49:22 2025
+@author: jacobwildt-persson
+"""
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# -----------------------------------------------
+# Requirements & Setup Instructions
+# -----------------------------------------------
+# Python version:
+# Requires Python 3.10 or later (tested on 3.12)
+# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
+# Recreate the environment with these commands in the terminal:
+# conda env create -f environment.yml
+# conda activate sprakenv
+#
+# Install all required packages:
+# Run these commands in the terminal:
+# pip install --upgrade gradio
+# pip install pdfplumber
+# pip install nltk
+# pip install transformers
+# pip install -U spacy
+# Download language models:
+# python -m spacy download es_core_news_lg
+# python -m spacy download en_core_web_lg  # if you add NER for English
+# Check Gradio version used:
+# import gradio as gr
+# print(gr.__version__)  # Gradio version 4.18.0
+# 🔗 Reference: Gradio Quickstart Guide
+# https://www.gradio.app/guides/quickstart
+#Hugging Face
+ # https://huggingface.co/models
+# English API model
+# LanguageTool API: https://languagetool.org/http-api/swagger
+# Remember!
+# Run your script inside a virtual environment (e.g. conda or venv) to avoid conflicts.
+# Recreate the environment with these commands in the terminal:
+# conda env create -f environment.yml
+# conda activate sprakenv
+# python -m spacy download es_core_news_lg
+#python -m nltk.downloader punkt wordnet
+# -----------------------------------------------
+"""
+Language learning app with a Gradio UI and support for multiple users:
+- Import text from file (.txt/.csv/.pdf) or manual text input
+- Grammar correction via transformers (Spanish) or LanguageTool API (English)
+- Analyze text (known/unknown words) per user & language
+- Save unknown words as known
+- Generate coherent practice sentence (Spanish & English)
+- Log grammar corrections and practice sentence suggestions to CSV
+"""
+import os
+import datetime
+import sqlite3
+import requests
+import random
+import pandas as pd
+import pdfplumber
+import spacy
+import csv
+# SQLite is accessed via the built-in sqlite3 module (no need to install sqlite3-binary)
+import sqlite3
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from transformers import AutoTokenizer, BartForConditionalGeneration, AutoModelForCausalLM
+import gradio as gr
+import gradio_client.utils as _gcu
# --- Patch for a Gradio client schema-conversion bug ---
# Both gradio_client helpers are wrapped so that a schema that is not a dict, or
# any exception raised by the original helper, produces the generic type name
# "any" instead of propagating to the caller.
# NOTE(review): presumably this works around a crash while Gradio builds its API
# description from non-dict schema nodes -- confirm against the pinned version.
_orig_json = _gcu.json_schema_to_python_type
_orig_get = _gcu.get_type


def _patched_json_to_py(schema, defs=None):
    """Return the Python type name for *schema*, or "any" if it cannot be derived."""
    if isinstance(schema, dict):
        try:
            return _orig_json(schema, defs)
        except Exception:
            # Fall through to the generic answer below.
            pass
    return "any"


def _patched_get_type(schema):
    """Return the type label for *schema*, or "any" if it cannot be derived."""
    if isinstance(schema, dict):
        try:
            return _orig_get(schema)
        except Exception:
            # Fall through to the generic answer below.
            pass
    return "any"


# Install the tolerant wrappers in place of the library functions.
_gcu.get_type = _patched_get_type
_gcu.json_schema_to_python_type = _patched_json_to_py
# --- SQLite database initialization ---
# Path of the SQLite file that stores every user's known words.
DB_NAME = "vocabulary.db"

# One row per (user, language, word). The UNIQUE constraint is what allows
# save_word_to_db() to use INSERT OR IGNORE for words that are already stored.
_CREATE_VOCABULARY_TABLE = """
    CREATE TABLE IF NOT EXISTS vocabulary (
      user_id   TEXT,
      language  TEXT,
      word      TEXT,
      timestamp TEXT,
      UNIQUE(user_id, language, word)
    )
"""

conn = sqlite3.connect(DB_NAME)
try:
    conn.execute(_CREATE_VOCABULARY_TABLE)
    conn.commit()
finally:
    # Always release the handle, even when table creation fails.
    conn.close()
# --- Save word to database ---
def save_word_to_db(user_id: str, language: str, word: str):
    """Store *word* as a known word for the given user and language.

    The row is written with INSERT OR IGNORE, so a word that is already stored
    for the same user and language is left untouched (its original timestamp
    is kept).

    Args:
        user_id: Identifier of the user the word belongs to.
        language: Language code the word is stored under.
        word: The word to store.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    ts = datetime.datetime.now().isoformat()
    conn = sqlite3.connect(DB_NAME)
    try:
        # "with conn" commits on success and rolls back if execute() raises.
        with conn:
            conn.execute(
                "INSERT OR IGNORE INTO vocabulary (user_id, language, word, timestamp) VALUES (?, ?, ?, ?)",
                (user_id, language, word, ts),
            )
    finally:
        # The connection context manager does not close the handle; do it here
        # so a failing insert cannot leak the connection.
        conn.close()
# --- Retrieve known words for user/language ---
def get_user_vocabulary(user_id: str, language: str) -> set[str]:
    """Return the set of words stored as known for *user_id* in *language*.

    Args:
        user_id: Identifier of the user whose vocabulary is read.
        language: Language code to filter on.

    Returns:
        The stored words; an empty set when nothing is stored.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        rows = conn.execute(
            "SELECT word FROM vocabulary WHERE user_id=? AND language=?",
            (user_id, language),
        ).fetchall()
    finally:
        # Close the handle even when the query fails.
        conn.close()
    return {row[0] for row in rows}
# --- Load NLP models ---
# Everything below is loaded once, when the module is imported.
_GEC_CHECKPOINT = "SkitCon/gec-spanish-BARTO-COWS-L2H"
_GPT2_ES_CHECKPOINT = "mrm8488/spanish-gpt2"
_GPT2_EN_CHECKPOINT = "gpt2"

# spaCy pipeline; correct_grammar() uses it to split Spanish text into sentences.
nlp = spacy.load("es_core_news_lg")

# Sequence-to-sequence model used for Spanish grammar correction.
tokenizer = AutoTokenizer.from_pretrained(_GEC_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_GEC_CHECKPOINT)

# Causal language models used to generate practice sentences.
gpt2_tokenizer_es = AutoTokenizer.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_model_es = AutoModelForCausalLM.from_pretrained(_GPT2_ES_CHECKPOINT)
gpt2_tokenizer_en = AutoTokenizer.from_pretrained(_GPT2_EN_CHECKPOINT)
gpt2_model_en = AutoModelForCausalLM.from_pretrained(_GPT2_EN_CHECKPOINT)

# Lemmatizer applied to every token in analyze_text().
lemmatizer = WordNetLemmatizer()
# --- Log to CSV (grammar corrections and sentence suggestions) ---
def log_to_csv(filename, row, fieldnames):
    """Append *row* to the CSV file *filename*, writing the header when needed.

    The header row is written when the file does not exist yet or exists but
    is empty, so a pre-created empty log file still ends up with column names.

    Args:
        filename: Path of the CSV file to append to.
        row: Mapping from field name to value for the new line.
        fieldnames: Column names, in output order.

    Raises:
        ValueError: If *row* contains a key that is not in *fieldnames*
            (csv.DictWriter's default behaviour).
        OSError: If the file cannot be opened for appending.
    """
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    with open(filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)
# --- File import ---
def import_file(path: str) -> str:
    """Return the text content of a .pdf, .csv or .txt file.

    The format is chosen from the (case-insensitive) file extension:
    PDF pages are extracted and joined with newlines, CSV files contribute the
    values of their "text" column joined with newlines, and text files are read
    as UTF-8.

    Args:
        path: Path of the file to read.

    Returns:
        The extracted text.

    Raises:
        ValueError: If a CSV file has no "text" column or the extension is not
            one of the supported formats.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        with pdfplumber.open(path) as pdf:
            # A page for which no text comes back contributes an empty string.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    if ext == ".csv":
        df = pd.read_csv(path)
        if "text" not in df.columns:
            raise ValueError("CSV saknar kolumnen 'text'.")
        return "\n".join(df["text"].astype(str))
    if ext == ".txt":
        # Context manager so the handle is closed deterministically.
        with open(path, encoding="utf-8") as fh:
            return fh.read()
    raise ValueError(f"Okänt filformat: {ext}")
# --- Grammar correction ---
def _apply_languagetool_matches(text: str, matches: list) -> str:
    """Apply the first suggested replacement of each LanguageTool match to *text*."""
    # Work from the end of the text backwards so that earlier offsets stay valid
    # after a replacement changes the length of the string.
    untouched_end = len(text)
    for match in sorted(matches, key=lambda m: m["offset"], reverse=True):
        start = match["offset"]
        end = start + match["length"]
        replacements = match.get("replacements") or []
        # A match without any suggestion must leave the text alone (replacing it
        # with "" would delete the flagged words). A match that overlaps a span
        # that was already rewritten is skipped because its offsets are stale.
        if not replacements or end > untouched_end:
            continue
        text = text[:start] + replacements[0]["value"] + text[end:]
        untouched_end = start
    return text


def correct_grammar(text: str, language: str) -> str:
    """Return *text* with grammar corrections applied.

    Spanish ("es") is corrected sentence by sentence with the local
    sequence-to-sequence model; the corrected sentences are joined with single
    spaces. Any other language code is sent to the public LanguageTool API and
    the first suggestion of every reported match is applied.

    Args:
        text: The text to correct.
        language: Language code; "es" selects the local model.

    Returns:
        The corrected text. Blank non-Spanish input is returned unchanged
        without contacting the API.

    Raises:
        requests.RequestException: If the LanguageTool request fails, times out
            or returns an HTTP error status.
    """
    if language == "es":
        corrected = []
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if not s:
                continue
            inp = tokenizer(s, return_tensors="pt", truncation=True, padding=True)
            out = model.generate(
                **inp,
                # Allow as many new tokens as the input sentence has.
                max_new_tokens=inp.input_ids.shape[1],
                num_beams=5,
                early_stopping=True,
            )
            corrected.append(tokenizer.decode(out[0], skip_special_tokens=True))
        return " ".join(corrected)

    # Other languages: LanguageTool API. Nothing to check in blank text.
    if not text.strip():
        return text
    resp = requests.post(
        "https://api.languagetool.org/v2/check",
        data={"text": text, "language": language},
        timeout=30,  # never hang the UI callback on a stalled connection
    )
    resp.raise_for_status()
    return _apply_languagetool_matches(text, resp.json().get("matches", []))
# --- Analyze known and unknown words ---
def analyze_text(text: str, user_id: str, language: str):
    """Split the words of *text* into those the user already knows and the rest.

    Purely alphabetic tokens are lower-cased and lemmatized, then looked up in
    the vocabulary stored for *user_id* and *language*. Repeated words appear
    once per occurrence, in text order.

    Returns:
        A ``(known, unknown)`` tuple of lemma lists.
    """
    lemmas = []
    for token in word_tokenize(text):
        if token.isalpha():
            lemmas.append(lemmatizer.lemmatize(token.lower()))
    vocab = get_user_vocabulary(user_id, language)
    known, unknown = [], []
    for lemma in lemmas:
        bucket = known if lemma in vocab else unknown
        bucket.append(lemma)
    return known, unknown
# --- Generate sentence using GPT-2 based on unknown words ---
def generate_coherent_sentence(text: str, user_id: str, language: str, num_unknown=2) -> str:
    """Generate one practice sentence built around the user's unknown words.

    Up to *num_unknown* distinct unknown words from *text* are picked at random
    and placed in a prompt for the Spanish ("es") or English (any other code)
    GPT-2 model. The first sentence of the sampled continuation is returned.

    Args:
        text: Source text whose unknown words are used.
        user_id: User whose stored vocabulary decides what counts as unknown.
        language: Language code; "es" selects the Spanish model and prompt.
        num_unknown: Maximum number of unknown words to include. Converted with
            int() so a float from a UI slider is accepted.

    Returns:
        The generated sentence, or a Swedish status message when there are no
        unknown words or the model output contains no letters.
    """
    _known, unknown = analyze_text(text, user_id, language)
    if not unknown:
        return "Inga okända ord att generera mening med."

    # analyze_text() reports one entry per occurrence; de-duplicate (keeping
    # order) so the same word cannot be drawn twice for one prompt.
    candidates = list(dict.fromkeys(unknown))
    chosen = random.sample(candidates, min(int(num_unknown), len(candidates)))

    # Local names deliberately differ from the module-level grammar
    # "tokenizer"/"model" to avoid shadowing them.
    if language == "es":
        prompt = "Escribe una sola frase clara que incluya estas palabras: " + ", ".join(chosen) + "."
        gen_tokenizer, gen_model = gpt2_tokenizer_es, gpt2_model_es
    else:
        prompt = "Write one clear sentence that includes the following words: " + ", ".join(chosen) + "."
        gen_tokenizer, gen_model = gpt2_tokenizer_en, gpt2_model_en

    inp = gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outs = gen_model.generate(
        **inp,
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        # GPT-2 tokenizers define no pad token; state the fallback explicitly.
        pad_token_id=gen_tokenizer.eos_token_id,
    )
    gen = gen_tokenizer.decode(outs[0], skip_special_tokens=True)

    # Drop the echoed prompt, then keep only the text up to the first period.
    body = gen[len(prompt):].strip() if gen.startswith(prompt) else gen.strip()
    sentence = (body.split(".")[0].strip() + ".") if "." in body else body
    if not any(c.isalpha() for c in sentence):
        return "Misslyckades att generera meningsfull övningsmening."
    return sentence
# --- Gradio process callback ---
def process(user, language, txt, file, do_grammar, do_save):
    """Run the analysis pipeline for the "run" button.

    Typed text takes precedence over an uploaded file. The text is optionally
    grammar-corrected, split into known/unknown words for the user, the unknown
    words are optionally stored as known, and the input/output pair is appended
    to ``grammarlog.csv``.

    Args:
        user: User name the vocabulary is stored under.
        language: Language code of the text.
        txt: Text typed or pasted by the user (may be empty or None).
        file: Uploaded file; either a path string or an object with ``.name``.
        do_grammar: Whether to run grammar correction.
        do_save: Whether to store the unknown words as known.

    Returns:
        A 5-tuple ``(corrected_text, known_words, unknown_words, status, "")``
        matching the five output components; the last element clears the
        practice-sentence box. Any exception is caught and its traceback is
        returned in the status field.
    """
    try:
        if txt and txt.strip():
            text = txt.strip()
        elif file:
            # Accept both a plain path and a file-like object exposing .name.
            text = import_file(getattr(file, "name", file))
        else:
            return "", "", "", "Ingen text angiven.", ""

        out = correct_grammar(text, language) if do_grammar else text
        kn, un = analyze_text(out, user, language)

        status = ""
        if do_save and un:
            # analyze_text() lists a word once per occurrence; store and count
            # each distinct word only once.
            new_words = list(dict.fromkeys(un))
            for w in new_words:
                save_word_to_db(user, language, w)
            status = f"Sparade {len(new_words)} ord."

        # Log the input and the (possibly corrected) output to CSV.
        log_to_csv(
            "grammarlog.csv",
            {
                "user": user, "language": language, "input": text,
                "output": out, "timestamp": datetime.datetime.now().isoformat()
            },
            ["user", "language", "input", "output", "timestamp"]
        )
        return out, ", ".join(kn), ", ".join(un), status, ""
    except Exception:
        # UI boundary: surface the full traceback in the status box instead of
        # letting the callback crash.
        import traceback
        tb = traceback.format_exc()
        return "", "", "", f"FEL i process:\n{tb}", ""
# --- Sentence generation callback ---
def coherent_fn(user, language, txt, num):
    """Gradio callback: generate a practice sentence and log it to CSV.

    Returns the generated sentence, or a Swedish error message containing the
    exception text when anything fails.
    """
    try:
        suggestion = generate_coherent_sentence(txt or "", user, language, num)
        # Record the practice suggestion together with the raw input text.
        log_row = {
            "user": user,
            "language": language,
            "input": txt,
            "output": suggestion,
            "timestamp": datetime.datetime.now().isoformat(),
        }
        # The dict's insertion order doubles as the CSV column order.
        log_to_csv("sentencelog.csv", log_row, list(log_row))
        return suggestion
    except Exception as err:
        return f"Fel vid generering: {err}"
# --- Gradio UI ---
# NOTE for users: set the language dropdown to the language of the text or file
# being analysed, so it is processed in its target language.
with gr.Blocks() as demo:
    gr.Markdown("### 🌟 Språkinlärningsapp med användare & flerspråkighet")

    # Who is using the app, and which language the text is in.
    with gr.Row():
        user_input = gr.Textbox(label="Användarnamn", placeholder="Ditt namn här")
        lang_dd = gr.Dropdown(choices=["es", "en"], value="es", label="Språk")

    # Text source, processing options and the two action buttons.
    with gr.Column():
        manual_input = gr.Textbox(lines=4, label="Skriv/klistra in text")
        file_input = gr.File(file_types=[".txt", ".csv", ".pdf"], label="Importera fil")
        grammar_cb = gr.Checkbox(label="Grammatikrättning")
        autosave_cb = gr.Checkbox(label="Spara okända ord")
        run_btn = gr.Button("Kör analys & korrigering")
        num_slider = gr.Slider(
            minimum=1,
            maximum=5,
            step=1,
            value=2,
            label="Antal okända ord för övning",
        )
        coherent_btn = gr.Button("Koherent övningsmening")

    # Result fields.
    corr_out = gr.Textbox(label="Korrigerad text", lines=4)
    known_out = gr.Textbox(label="Kända ord")
    unknown_out = gr.Textbox(label="Okända ord")
    status_out = gr.Textbox(label="Status")
    coherent_out = gr.Textbox(label="Koherent övningsmening")

    # --- Button click wiring ---
    # The order of each list must match the callback's parameters / return tuple.
    process_inputs = [user_input, lang_dd, manual_input, file_input, grammar_cb, autosave_cb]
    process_outputs = [corr_out, known_out, unknown_out, status_out, coherent_out]
    run_btn.click(fn=process, inputs=process_inputs, outputs=process_outputs)

    coherent_inputs = [user_input, lang_dd, manual_input, num_slider]
    coherent_btn.click(fn=coherent_fn, inputs=coherent_inputs, outputs=[coherent_out])
# --- Start app ---
if __name__ == "__main__":
    # launch() returns (app, local_url, share_url); unpack it so the message
    # shows a URL rather than the whole tuple.
    _app, local_url, share_url = demo.launch(
        share=True, inbrowser=True, prevent_thread_lock=True
    )
    print("Appen körs på:", share_url or local_url)
    # With prevent_thread_lock=True the call above returns immediately; keep the
    # main thread alive so the script does not exit while the server is running.
    demo.block_thread()