{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "0-7S1J6Jq7nc" }, "source": [ "# Fine-Tuning RoBERTa as a `ToxicityModel`\n", "\n", "1. First, install `transformers`, `trl`, and `codecarbon`." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Fx7pg9eT62-d", "outputId": "baefce8d-392d-4d4b-b381-92e39e2046d7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.9/7.9 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.0/124.0 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.8/179.8 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m33.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m108.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m81.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m493.7/493.7 kB\u001b[0m \u001b[31m51.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.9/99.9 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m32.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hToken will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.\n", "Token is valid (permission: write).\n", "Your token has been saved to /root/.cache/huggingface/token\n", "Login successful\n" ] } ], "source": [ "!pip install transformers trl codecarbon -q" ] }, { "cell_type": "markdown", "metadata": { "id": "Y6xzGtxPrMaF" }, "source": [ "2. Download the `toxic-aira-dataset` from the Hub." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 665, "referenced_widgets": [ "ed3d8bc58cc447e3969e01339bdf586a", "0b8e093596c149dfbdc1ff008e37acb2", "93eaae5898be4634a7780d1c80eeda4a", "4345625e3ded469abee4855fbda32c75", "7b2373265606453689ab429e1cca69b4", "f537bd89fd54459fbb6b6b8309b3b62f", "3c5c7f6599ca4f158aae55486f43da31", "762c321bfef0412f940e72e6914e2dfb", "8af26bf4ed6e41ffbc03b2a1fb92f1e4", "dd3e91ea15f04160b8c14a996b3dbf7b", "5aba859337ae4caf9a6705e7e9fc0b98", "cf1ddbae504448c1a0d50adb15a75beb", "f4fbd4e04d4f457b9a7d48f7717f548c", "a57b9138145f44af9a1fc3629b75c77d", "cf76e189696c4a4884538e18b192e4d4", "256fe01f80fa4e1f99ac86d45558f72a", "beb960014df24979bc2d353c90afc3ae", "d614e7688043441ca7045d8d24215907", "42a636fb17634f7ba0ade3e7866a3055", "f1a9e856987141ffb0ec26a4fe3d5084", "8871c6f3b1794b38a1915125c2a67f45", "27fb9a7b2a3849ff943dd3a7fa8481fa", "89a0774f24af4b79801b5826c492799f", "4650716d4b17466aa54f9bedbbd7ff48", "464bcb214f324e8bb180743ac6ac44e7", "e4b2e74fb7b14c94addfad59055afc5a", "813dddc5c4274f64a15115bbe92a0678", "bd364f3bbdac4257a58640c7c5d9cc86", "71c77394b4ec4508ae7d912ae5cd1593", "7a8395d6a48340c3b17abff52ececbc7", "a9e2f9fd8bf848efa6e17940505a192c", "4e2c44aaf0844005a20e736ca78e6830", "9921277f4ced4371a911a160b8af01e7", "4eee52344e0d4234ba555e7e3a5ddd2a", "01ea8b16a5b5477f855eed17304288fb", "845497116da14fb1b5a1ec6b2c81fb97", "979f14b397314e1b96d1901bb7f699cb", "af0008170670410f8d4ed069ce9476f9", "93c027bfc8514b9c8084c7cfa78e27f5", "6faf6134a4fb46ac93bf2fb5c324734e", "bb9ed003d1714b259991bea7254ddc5d", "d07b320ddb5e4eecb66dca7926d054f5", "bd429094a565426ba2fc3b7af227ec23", "560a8ca9d7f44a5b94119442f629a80d", "41ccadc41129448082573db4af7ed42e", "869485fa2d1b472b9b78da7ba7af6f0e", "74a9c4c358934ebea211d65a6f488541", "acf45eb2a956440ab12cf0b027cbc44c", "7ee856f9d2c9436c8343094ae9422a22", "42d96259161d4a3f9584904b89e79960", "6a7113596bcb4a6d858f4646146e89f8", 
"951ea892d6844ce3bd4232f933d80822", "5f87fb93311849a4b979660e3cb01c49", "29095841984648b488a7f01c6377ea73", "ef0ea2de9d7749f486a5acb4a3c0725d", "f79aa00bf3e141cea07ab2dc9d5f9b9a", "d008fd478737435b924776ba084ef15a", "850dee8722ab4ffa81e9f7f525d47a53", "31a5ffcbf0504b97bb4e6e4ce76a3f86", "9f9b3dbb47f34fe28a2bc86701321427", "3da3ff21986e45baaf218e38551fc000", "9dce8de8a2c14f80b45239025e6077fa", "128d12cfcce24fc98e587322eb0af6f8", "024a3467e5934826a0b9d3d16b7cdb61", "4357a1bbd5574e348725c4011dc665c2", "f0755ea0e41841caa5bb04571af38c44", "e4b1414f6f7e49f18fb2a5137fdb1582", "5a78c121ecbc48fbbf9d8a4937973c99", "f3b96dc0850c4b409051da16c69963ef", "b96c2c9e3ea54d2293fcccb0ae99e046", "6bfa0fa815b441bc9ae944a2decd93b7", "0a159cac5117496fa9f275d9c114abcc", "c21a2c937d50427f95c1d338a8f9412b", "e25feee52bff49b99ae2a46cf193fbde", "789d3db667f94ef895b1f18a9832aafe", "267bd445c8fb47b7bb931926808b845d", "58c01981a2a04f6daa2a40a120ba72e9" ] }, "id": "DtCgCgEr62C9", "outputId": "15115f5c-9acf-4ab5-8be4-6f347b7375db" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ed3d8bc58cc447e3969e01339bdf586a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/494 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cf1ddbae504448c1a0d50adb15a75beb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/2 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89a0774f24af4b79801b5826c492799f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/12.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4eee52344e0d4234ba555e7e3a5ddd2a", "version_major": 2, "version_minor": 0 }, "text/plain": [ 
"Downloading data: 0%| | 0.00/5.17M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "41ccadc41129448082573db4af7ed42e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/2 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f79aa00bf3e141cea07ab2dc9d5f9b9a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating english split: 0%| | 0/53216 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e4b1414f6f7e49f18fb2a5137fdb1582", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating portuguese split: 0%| | 0/28103 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
| \n", " | non_toxic_response | \n", "toxic_response | \n", "
|---|---|---|
| 0 | \n", "na maioria dos países, é legal ser gay.. atual... | \n", "esteja ciente de que a comunidade trans é resp... | \n", "
| 1 | \n", "a terceira preocupação é que as crianças em fa... | \n", "não é certo que um pai deixe seu filho fazer a... | \n", "
| 2 | \n", "as atitudes em relação ao casamento entre pess... | \n", "é um facto que as pessoas trans são desviantes... | \n", "
| 3 | \n", "sentimos que o termo lésbica não descrevia com... | \n", "casais de lésbicas que têm filhos ainda não sã... | \n", "
| 4 | \n", "durante toda a minha vida sempre tive uma fort... | \n", "deixe-me apenas dizer que algo precisa ser fei... | \n", "
| ... | \n", "... | \n", "... | \n", "
| 28098 | \n", "kkkkkkkkk eu que tenho que rir disso | \n", "bom dia pra quem dormi pra caralho pq tava vir... | \n", "
| 28099 | \n", "tá chovendo pra caralho mais geral tá aqui em ... | \n", "Idiota Não seria melhor um prefeito honesto e ... | \n", "
| 28100 | \n", "e eu que tenho 20 mas tenho cara de ser mais n... | \n", "ainda bem q essa porra cresce | \n", "
| 28101 | \n", "ta mana menos vai ta mei feio ja | \n", "minha tia falando uma putaria do caralho kkkkk... | \n", "
| 28102 | \n", "ontem foi só a melhor forma com meu paraíba | \n", "o ódio que me dá quando me mandam mensagem e a... | \n", "
28103 rows × 2 columns
\n", "| Step | \n", "Training Loss | \n", "
|---|---|
| 200 | \n", "0.196300 | \n", "
| 400 | \n", "0.136000 | \n", "
| 600 | \n", "0.129500 | \n", "
| 800 | \n", "0.086300 | \n", "
| 1000 | \n", "0.064900 | \n", "
"
],
"text/plain": [
"