{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Fine-tuning `Helsinki-NLP/opus-mt-fr-en` on OPUS Books (FR → EN)\n",
        "\n",
        "Fine-tunes a MarianMT French→English model on the `opus_books` `en-fr` corpus, reports SacreBLEU on a held-out test split and, when an `HF_TOKEN` secret is available, pushes the result to the Hugging Face Hub.\n",
        "\n",
        "Run the notebook top to bottom on a fresh runtime (**Runtime → Restart and run all**)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "k6MeElK2WK08"
      },
      "outputs": [],
      "source": [
        "# Pinned to the versions resolved by the last recorded Colab run.\n",
        "# Deliberately no --force-reinstall: it replaced Colab's preinstalled numpy/pandas/pyarrow\n",
        "# and left tensorflow, numba and cudf with unsatisfied requirements.\n",
        "# transformers>=4.46 is needed for the `eval_strategy` / `processing_class` arguments used below.\n",
        "%pip install -q \"datasets==3.6.0\" \"huggingface_hub==0.33.0\" \"fsspec==2025.3.0\" \"transformers>=4.46\" evaluate sacrebleu sentencepiece"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2Suudw5uV_j_"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "import evaluate\n",
        "import numpy as np\n",
        "import torch\n",
        "from datasets import DatasetDict, load_dataset\n",
        "from huggingface_hub import login\n",
        "from transformers import (\n",
        "    AutoModelForSeq2SeqLM,\n",
        "    AutoTokenizer,\n",
        "    DataCollatorForSeq2Seq,\n",
        "    Seq2SeqTrainer,\n",
        "    Seq2SeqTrainingArguments,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Configuration"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kDiOO10WH4va"
      },
      "outputs": [],
      "source": [
        "MODEL = \"Helsinki-NLP/opus-mt-fr-en\"\n",
        "SRC, TGT = \"fr\", \"en\"\n",
        "BATCH = 32\n",
        "EPOCHS = 3\n",
        "MAX_LENGTH = 128  # token cap applied to source sentences, target sentences and generation\n",
        "SEED = 42  # used for both dataset splits and for training\n",
        "OUTPUT = \"opus-mt-fr-en-colab\"\n",
        "HUB_MODEL_ID = \"DomLoyer/\" + OUTPUT  # replace \"DomLoyer\" with your own Hub username"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Hugging Face authentication (optional)\n",
        "\n",
        "The token is only needed to push the fine-tuned model to the Hub. Without it the notebook still trains and evaluates."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DP6O5SJKDNnX"
      },
      "outputs": [],
      "source": [
        "def get_hf_token():\n",
        "    \"\"\"Return the Hugging Face token, or None when no token is configured.\n",
        "\n",
        "    Looks in the Colab secrets panel first and falls back to the HF_TOKEN\n",
        "    environment variable (also used when the notebook runs outside Colab).\n",
        "    \"\"\"\n",
        "    try:\n",
        "        from google.colab import userdata  # only importable inside Colab\n",
        "    except ImportError:\n",
        "        return os.environ.get(\"HF_TOKEN\") or None\n",
        "    try:\n",
        "        return userdata.get(\"HF_TOKEN\") or None\n",
        "    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):\n",
        "        # Secret missing, or this notebook was not granted access to it.\n",
        "        return os.environ.get(\"HF_TOKEN\") or None\n",
        "\n",
        "\n",
        "# Keep the token in a variable and never make it the last expression of a cell:\n",
        "# a displayed value is saved in the notebook outputs and leaks when the file is shared.\n",
        "hf_token = get_hf_token()\n",
        "if hf_token:\n",
        "    login(token=hf_token)\n",
        "print(\"Hub push:\", \"enabled\" if hf_token else \"disabled (no HF_TOKEN found)\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load and split the dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "ds = load_dataset(\"opus_books\", \"en-fr\")\n",
        "\n",
        "# The corpus ships with a single \"train\" split: hold out 5% for test,\n",
        "# then 5% of what remains for validation.\n",
        "holdout_split = ds[\"train\"].train_test_split(0.05, seed=SEED)\n",
        "dev_split = holdout_split[\"train\"].train_test_split(0.05, seed=SEED)\n",
        "raw = DatasetDict(\n",
        "    {\n",
        "        \"train\": dev_split[\"train\"],\n",
        "        \"validation\": dev_split[\"test\"],\n",
        "        \"test\": holdout_split[\"test\"],\n",
        "    }\n",
        ")\n",
        "raw"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Tokenize"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "tok = AutoTokenizer.from_pretrained(MODEL)\n",
        "\n",
        "\n",
        "def preprocess(ex):\n",
        "    \"\"\"Tokenize a batch of translation pairs.\n",
        "\n",
        "    `ex[\"translation\"]` is a list of dicts keyed by language code. Source\n",
        "    sentences become the model inputs and the target token ids become `labels`.\n",
        "    Padding is left to the data collator so each batch is padded dynamically.\n",
        "    \"\"\"\n",
        "    sources = [pair[SRC] for pair in ex[\"translation\"]]\n",
        "    targets = [pair[TGT] for pair in ex[\"translation\"]]\n",
        "    model_inputs = tok(sources, max_length=MAX_LENGTH, truncation=True, padding=False)\n",
        "    target_enc = tok(text_target=targets, max_length=MAX_LENGTH, truncation=True, padding=False)\n",
        "    model_inputs[\"labels\"] = target_enc[\"input_ids\"]\n",
        "    return model_inputs\n",
        "\n",
        "\n",
        "tokenized = raw.map(preprocess, batched=True, remove_columns=raw[\"train\"].column_names)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Model, data collator and metric"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "model = AutoModelForSeq2SeqLM.from_pretrained(MODEL).to(device)\n",
        "dc = DataCollatorForSeq2Seq(tok, model=model)\n",
        "sacrebleu = evaluate.load(\"sacrebleu\")\n",
        "\n",
        "\n",
        "def compute_metrics(p):\n",
        "    \"\"\"Decode generated ids and reference ids, then score them with SacreBLEU.\n",
        "\n",
        "    Returns a dict with a single \"bleu\" entry (the SacreBLEU corpus score).\n",
        "    \"\"\"\n",
        "    preds, labels = p.predictions, p.label_ids\n",
        "    if isinstance(preds, tuple):\n",
        "        preds = preds[0]\n",
        "    # -100 marks positions to ignore; swap in the pad id so the tokenizer can decode them.\n",
        "    preds = np.where(preds != -100, preds, tok.pad_token_id)\n",
        "    labels = np.where(labels != -100, labels, tok.pad_token_id)\n",
        "    decoded_preds = [text.strip() for text in tok.batch_decode(preds, skip_special_tokens=True)]\n",
        "    decoded_labels = [text.strip() for text in tok.batch_decode(labels, skip_special_tokens=True)]\n",
        "    result = sacrebleu.compute(\n",
        "        predictions=decoded_preds,\n",
        "        references=[[ref] for ref in decoded_labels],  # sacrebleu expects a list of references per prediction\n",
        "    )\n",
        "    return {\"bleu\": result[\"score\"]}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Train"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "args = Seq2SeqTrainingArguments(\n",
        "    output_dir=OUTPUT,\n",
        "    eval_strategy=\"epoch\",  # current name of the former `evaluation_strategy` argument\n",
        "    save_strategy=\"epoch\",  # must match eval_strategy for load_best_model_at_end\n",
        "    per_device_train_batch_size=BATCH,\n",
        "    per_device_eval_batch_size=BATCH,\n",
        "    learning_rate=2e-5,\n",
        "    num_train_epochs=EPOCHS,\n",
        "    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed\n",
        "    fp16=torch.cuda.is_available(),\n",
        "    load_best_model_at_end=True,\n",
        "    metric_for_best_model=\"bleu\",\n",
        "    seed=SEED,\n",
        "    push_to_hub=bool(hf_token),  # only push when a token was found\n",
        "    hub_model_id=HUB_MODEL_ID,\n",
        "    hub_token=hf_token,\n",
        ")\n",
        "\n",
        "trainer = Seq2SeqTrainer(\n",
        "    model=model,\n",
        "    args=args,\n",
        "    train_dataset=tokenized[\"train\"],\n",
        "    eval_dataset=tokenized[\"validation\"],\n",
        "    processing_class=tok,  # replaces the deprecated `tokenizer=` argument\n",
        "    data_collator=dc,\n",
        "    compute_metrics=compute_metrics,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "trainer.train()\n",
        "trainer.save_model()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Evaluate on the test split"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "test_metrics = trainer.predict(tokenized[\"test\"], metric_key_prefix=\"test\").metrics\n",
        "test_metrics"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Inference"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# `sentence` (not `s`) so the loop variable does not shadow any earlier name.\n",
        "for sentence in [\"Bonjour le monde\", \"J'espère BLEU ~40\", \"Bonne traduction !\"]:\n",
        "    encoded = tok(sentence, return_tensors=\"pt\", truncation=True).to(device)\n",
        "    out = model.generate(**encoded, max_length=MAX_LENGTH, num_beams=4)\n",
        "    print(f\"{sentence} → {tok.decode(out[0], skip_special_tokens=True)}\")"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "authorship_tag": "ABX9TyNhHG9w1J+5wHaNvPlhZ376",
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}