"""Shared constants, filesystem paths, and static data tables for EDEN. The workspace root defaults to the current working directory and can be moved by setting the ``EDEN_HOME`` environment variable. All training artifacts live under ``$EDEN_HOME/eden_system``. """ from __future__ import annotations import os from pathlib import Path ROOT = Path(os.environ.get("EDEN_HOME", Path.cwd())).resolve() SYSTEM_DIR = ROOT / "eden_system" DATA_DIR = SYSTEM_DIR / "data" CHECKPOINT_DIR = SYSTEM_DIR / "checkpoints" SESSIONS_DIR = SYSTEM_DIR / "training_sessions" EXPORT_DIR = SYSTEM_DIR / "exports" RUN_DIR = SYSTEM_DIR / "run" PAIRS_PATH = DATA_DIR / "pairs.jsonl" TOKENIZER_PATH = DATA_DIR / "tokenizer.json" CONFIG_PATH = DATA_DIR / "train_config.json" METRICS_PATH = RUN_DIR / "metrics.jsonl" STATE_PATH = RUN_DIR / "state.json" TRAIN_LOG_PATH = RUN_DIR / "train.log" PAUSE_REQUEST_PATH = RUN_DIR / "pause.request" SPECIAL_TOKENS = ["[UNK]", "[PAD]", "[BOS]", "[EOS]"] UNK_ID = 0 PAD_ID = 1 BOS_ID = 2 EOS_ID = 3 DATASET_SOURCES = [ { "name": "JFLEG", "detail": "Grammar-correction examples that teach rough sentence to corrected sentence patterns.", }, { "name": "Grammarly CoEdIT", "detail": "Correction and rewrite tasks for polishing, clarity, and instruction-style edits.", }, { "name": "W&I/LOCNESS", "detail": "Learner-English correction examples, loaded when the dataset is available.", }, { "name": "ASSET", "detail": "Sentence simplification and rewrite examples for cleaner wording.", }, { "name": "WikiSplit", "detail": "Sentence-flow examples that help with paragraph and sentence structure.", }, { "name": "MRPC", "detail": "Paraphrase pairs that teach meaning-preserving rewording.", }, { "name": "Seed writing examples", "detail": "Small built-in clean sentences used for offline setup and everyday writing style.", }, { "name": "Synthetic text noise", "detail": "Generated typos, dyslexia-like swaps, punctuation fixes, capitalization, and identity-preservation pairs.", }, ] 
# Lowercase letter -> string of the letters on neighbouring keys of a QWERTY
# keyboard. Entries are listed row by row (top, home, bottom).
KEYBOARD_ADJ: dict[str, str] = {
    # Top row
    "q": "wa",
    "w": "qeas",
    "e": "wrds",
    "r": "etdf",
    "t": "ryfg",
    "y": "tugh",
    "u": "yihj",
    "i": "uojk",
    "o": "ipkl",
    "p": "ol",
    # Home row
    "a": "qwsz",
    "s": "awedxz",
    "d": "serfcx",
    "f": "drtgvc",
    "g": "ftyhbv",
    "h": "gyujnb",
    "j": "huiknm",
    "k": "jiolm",
    "l": "kop",
    # Bottom row
    "z": "asx",
    "x": "zsdc",
    "c": "xdfv",
    "v": "cfgb",
    "b": "vghn",
    "n": "bhjm",
    "m": "njk",
}

# Letter pairs that look like mirrored or rotated copies of each other. Every
# pair is stored in both directions.
LETTER_SWAPS: dict[str, str] = {
    "b": "d", "d": "b",
    "p": "q", "q": "p",
    "n": "u", "u": "n",
    "m": "w", "w": "m",
}

# Two-word tuples of commonly confused words.
# NOTE(review): most pairs appear in both directions, but ("were", "we're")
# and ("where", "were") have no mirrored entry -- confirm whether intended.
HOMOPHONES: list[tuple[str, str]] = [
    ("their", "there"),
    ("their", "they're"),
    ("there", "their"),
    ("there", "they're"),
    ("your", "you're"),
    ("you're", "your"),
    ("its", "it's"),
    ("it's", "its"),
    ("to", "too"),
    ("too", "to"),
    ("then", "than"),
    ("than", "then"),
    ("affect", "effect"),
    ("effect", "affect"),
    ("loose", "lose"),
    ("lose", "loose"),
    ("weather", "whether"),
    ("whether", "weather"),
    ("were", "we're"),
    ("where", "were"),
    ("quiet", "quite"),
    ("quite", "quiet"),
]

# Correctly spelled word -> list of misspellings or informal shortenings.
COMMON_TYPOS: dict[str, list[str]] = {
    "the": ["teh", "hte", "th"],
    "and": ["adn", "nad", "an"],
    "that": ["taht", "tath", "thta"],
    "with": ["wiht", "whit", "wth"],
    "have": ["hvae", "ahve", "hve"],
    "this": ["tihs", "htis", "ths"],
    "from": ["form", "fomr", "frmo"],
    "their": ["thier", "tehir", "theri"],
    "which": ["whihc", "wihch", "wich"],
    "because": ["becuase", "becasue", "bc", "cuz", "coz"],
    "people": ["poeple", "pepole", "peolpe"],
    "should": ["shoud", "shold", "shoulld"],
    "would": ["wuold", "wolud", "woud"],
    "could": ["cuold", "colud", "coud"],
    "really": ["realy", "relly", "realli"],
    "everything": ["evrything", "everthing", "everythin"],
    "something": ["somthing", "sumthing", "soemthing"],
    "different": ["diffrent", "diferent", "difrent"],
    "important": ["importnat", "impotant", "importent"],
    "friend": ["freind", "frined", "frend"],
    "through": ["thorugh", "throuhg", "thru"],
    "write": ["rite", "wrte", "witre"],
    "writing": ["writting", "riteing", "writng"],
    "sentence": ["sentance", "sentnce", "sentense"],
    "paragraph": ["paragraf", "pargraph", "paragragh"],
    "structure": ["strucher", "structer", "strucutre"],
    "possible": ["posable", "possable", "posible"],
    "computer": ["coumputer", "compter", "comptuer"],
    "definitely": ["definately", "definatly", "definitly"],
    "separate": ["seperate", "seperete"],
    "necessary": ["neccessary", "necesary", "neccesary"],
    "believe": ["beleive", "belive"],
    "environment": ["enviroment", "enviornment"],
    "opportunity": ["oppurtunity", "oportunity"],
    "successful": ["succesful", "sucessful", "successfull"],
    "probably": ["probly", "probaly"],
}

# Built-in, already-correct example sentences.
SEED_CLEAN_SENTENCES: list[str] = [
    "I want to learn how to spell better and write stronger sentences.",
    "This paragraph has a few good ideas, but it needs clearer structure and smoother wording.",
    "Please revise this message so it sounds natural, polite, and easy to understand.",
    "The project is almost finished, but we still need to fix several bugs before launch.",
    "Training a model takes patience because it learns patterns from many examples over time.",
    "The meeting was helpful because everyone explained their concerns clearly.",
    "I am working on a tool that improves spelling, grammar, punctuation, and paragraph flow.",
    "Good writing keeps the original meaning while making the sentence easier to read.",
    "The assistant should correct mistakes without changing the user's voice too much.",
    "When a sentence is confusing, a careful rewrite can make the idea much clearer.",
    "I had trouble explaining the issue, so I rewrote the paragraph with more detail.",
    "The final version should sound confident, accurate, and friendly.",
]