{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "torch.cuda.is_available()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check that we can load the tokenizer and the model.  The first time this runs it will take a while.  The files go under ~/.cache/huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
      "The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. \n",
      "The class this function is called from is 'LlamaTokenizer'.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3ab80e2a1c0744e0af747ba63429a2af",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from transformers import LlamaTokenizer, LlamaForCausalLM\n",
    "tokenizer = LlamaTokenizer.from_pretrained(\n",
    "    \"decapoda-research/llama-7b-hf\")\n",
    "   \n",
    "#tokenizer.pad_token_id = (0)\n",
    "#tokenizer.padding_side = 'left'\n",
    "model = LlamaForCausalLM.from_pretrained(\n",
    "    \"decapoda-research/llama-7b-hf\",\n",
    "    device_map=\"auto\",\n",
    "    torch_dtype=torch.float16\n",
    ")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Construct a basic example to interact with the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/matt/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/generation/utils.py:1220: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)\n",
      "  \"You have modified the pretrained model configuration to control generation. This is a\"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " ⁇  hey dude, talk to me.\n",
      "I'm a 20 year old guy from the UK. I'm a bit of a nerd, I like to read, I like to write, I like to play video games, I like to watch movies, I like to listen\n",
      " ⁇  whats the capital of georgia?\n",
      "What is the capital of Georgia?\n",
      "The capital of Georgia is Atlanta.\n",
      "What is the capital of Georgia?\n",
      "The capital of Georgia is Atlanta. The capital of Georgia is Atlanta. The capital of Georgia is Atlanta. The capital of Georgia is Atlanta. The\n"
     ]
    }
   ],
   "source": [
    "def ask(q,l=64):\n",
    "    toks = tokenizer(q , return_tensors='pt')\n",
    "    ctoks = toks.input_ids.to('cuda')\n",
    "    gen = model.generate(ctoks, max_length=64)\n",
    "    return tokenizer.decode(gen[0])\n",
    "\n",
    "r = ask('hey dude, talk to me')\n",
    "print(r)\n",
    "\n",
    "r = ask('whats the capital of georgia?')\n",
    "print(r)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get the WikiSQL project so we can get the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fatal: destination path 'WikiSQL' already exists and is not an empty directory.\n",
      "data/\n",
      "data/train.jsonl\n",
      "data/test.tables.jsonl\n",
      "data/test.db\n",
      "data/dev.tables.jsonl\n",
      "data/dev.db\n",
      "data/test.jsonl\n",
      "data/train.tables.jsonl\n",
      "data/train.db\n",
      "data/dev.jsonl\n"
     ]
    }
   ],
   "source": [
    "! git clone https://github.com/salesforce/WikiSQL\n",
    "! tar xvjf WikiSQL/data.tar.bz2"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Figure out what the actual data set has in it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'phase': 1, 'table_id': '1-1000181-1', 'question': 'Tell me what the notes are for South Australia ', 'sql': {'sel': 5, 'conds': [[3, 0, 'SOUTH AUSTRALIA']], 'agg': 0}}\n",
      "\n",
      "1-1000181-1\n",
      "['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes']\n",
      "\n",
      "{'id': '1-1000181-1', 'header': ['State/territory', 'Text/background colour', 'Format', 'Current slogan', 'Current series', 'Notes'], 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'rows': [['Australian Capital Territory', 'blue/white', 'Yaa·nna', 'ACT · CELEBRATION OF A CENTURY 2013', 'YIL·00A', 'Slogan screenprinted on plate'], ['New South Wales', 'black/yellow', 'aa·nn·aa', 'NEW SOUTH WALES', 'BX·99·HI', 'No slogan on current series'], ['New South Wales', 'black/white', 'aaa·nna', 'NSW', 'CPX·12A', 'Optional white slimline series'], ['Northern Territory', 'ochre/white', 'Ca·nn·aa', 'NT · OUTBACK AUSTRALIA', 'CB·06·ZZ', 'New series began in June 2011'], ['Queensland', 'maroon/white', 'nnn·aaa', 'QUEENSLAND · SUNSHINE STATE', '999·TLG', 'Slogan embossed on plate'], ['South Australia', 'black/white', 'Snnn·aaa', 'SOUTH AUSTRALIA', 'S000·AZD', 'No slogan on current series'], ['Victoria', 'blue/white', 'aaa·nnn', 'VICTORIA - THE PLACE TO BE', 'ZZZ·562', 'Current series will be exhausted this year']], 'name': 'table_1000181_1'}\n",
      "SELECT  col5 FROM table WHERE col3 = SOUTH AUSTRALIA\n",
      "SELECT  Notes FROM table WHERE Current slogan = SOUTH AUSTRALIA\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import sys\n",
    "import re\n",
    "sys.path.append('./WikiSQL')\n",
    "from WikiSQL.lib.query import Query\n",
    "\n",
    "def replace_cols(qs,cmap):\n",
    "    for k,v in cmap.items():\n",
    "        qs = re.sub(f' {k} ',f' {v} ',qs)\n",
    "    return qs\n",
    "\n",
    "with open('data/train.jsonl') as f:\n",
    "    j = f.readline()\n",
    "    js = json.loads(j)\n",
    "    print(js)\n",
    "    \n",
    "print()\n",
    "\n",
    "with open('data/train.tables.jsonl') as f:\n",
    "    j = f.readline()\n",
    "    js2 = json.loads(j)\n",
    "    #print(js)\n",
    "    print(js2['id'])\n",
    "    print(js2['header'])\n",
    "    print()\n",
    "    print(js2)\n",
    "\n",
    "sql = js['sql']\n",
    "q = Query.from_dict(sql)\n",
    "print(q)\n",
    "cm = {f'col{i}':js2['header'][i] for i in range(len(js2['header']))}\n",
    "qs = replace_cols(str(q),cm)\n",
    "print(qs)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ok, their query class deals poorly with stringifying the constraints.  Let's mock up natural language prompt and SQL response."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.\n",
      "### Question: What is the Displacement of the Iveco F1CE3481E Engine?\n",
      "### Input: Table 2-1415821-6 has columns Model (text),Engine (text),Displacement (text),Valvetrain (text),Fuel system (text),Max. power at rpm (text),Max. torque at rpm (text).  \n",
      "### Answer: SELECT  Displacement FROM 2-1415821-6 WHERE Engine = 'iveco f1ce3481e'\n",
      "\n",
      "Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.\n",
      "### Question: What is the record of team utah?\n",
      "### Input: Table 2-17355628-9 has columns Game (real),Date (text),Team (text),Score (text),High points (text),High rebounds (text),High assists (text),Location Attendance (text),Record (text).  \n",
      "### Answer: SELECT  Record FROM 2-17355628-9 WHERE Team = 'utah'\n",
      "\n",
      "Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.\n",
      "### Question: What is the home of the team with a 16-8 record?\n",
      "### Input: Table 2-16188254-4 has columns Date (text),Visitor (text),Score (text),Home (text),Leading scorer (text),Attendance (text),Record (text).  \n",
      "### Answer: SELECT  Home FROM 2-16188254-4 WHERE Record = '16-8'\n",
      "\n",
      "Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.\n",
      "### Question: What week did the Galaxy play the Amsterdam Admirals?\n",
      "### Input: Table 1-24814477-2 has columns Week (real),Date (text),Kickoff (text),Opponent (text),Final score (text),Team record (text),Game site (text),Attendance (real).  \n",
      "### Answer: SELECT  Week FROM 1-24814477-2 WHERE Opponent = 'Amsterdam Admirals'\n",
      "\n",
      "Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.\n",
      "### Question: How many caps did Mitchell Duke have overall?\n",
      "### Input: Table 2-1257177-1 has columns Player (text),Country (text),Caps (real),Goals (text),Years Active (text).  \n",
      "### Answer: SELECT COUNT Caps FROM 2-1257177-1 WHERE Player = 'mitchell duke'\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "\n",
    "# defined by WikiSQL\n",
    "\n",
    "agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']\n",
    "cond_ops = ['=', '>', '<', 'OP']\n",
    "syms = ['SELECT', 'WHERE', 'AND', 'COL', 'TABLE', 'CAPTION', 'PAGE', 'SECTION', 'OP', 'COND', 'QUESTION', 'AGG', 'AGGOPS', 'CONDOPS']\n",
    "\n",
    "def fix_repr(d,cols,types,tid):\n",
    "    sel_index=d['sel'] \n",
    "    agg_index=d['agg']\n",
    "    conditions=d['conds']\n",
    "    col = cols[sel_index]\n",
    "    rep = 'SELECT {agg} {sel} FROM {tid}'.format(\n",
    "            agg=agg_ops[agg_index],\n",
    "            sel=col,\n",
    "            tid=tid\n",
    "            )\n",
    "    if conditions:\n",
    "        cs = []\n",
    "        for i, o, v in conditions:\n",
    "            #print(i,cols)\n",
    "            nm = cols[i]\n",
    "            op = cond_ops[o]\n",
    "            \n",
    "            if types[i] in ['text']:\n",
    "                val = f\"\\'{v}\\'\"\n",
    "            else:\n",
    "                val = v\n",
    "            cs.append(f'{nm} {op} {val}')\n",
    "        #print(cs)\n",
    "\n",
    "        rep +=  ' WHERE ' + ' AND '.join(cs)\n",
    "    \n",
    "    return rep\n",
    "\n",
    "tbl_cols = {}\n",
    "tbl_types = {}\n",
    "tbl_str = {}\n",
    "\n",
    "prefix = 'Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.'\n",
    "\n",
    "def tbl_def_to_string(id, header, types):\n",
    "    ht = [f'{header[i]} ({types[i]})' for i in range(len(header))]\n",
    "    s = f'\\n### Input: Table {id} has columns ' + ','.join(ht) + '.  '\n",
    "    return s\n",
    "\n",
    "with open('data/train.tables.jsonl') as f:\n",
    "    for line in f:\n",
    "        js = json.loads(line)\n",
    "        id = js['id']\n",
    "        hdr = js['header']\n",
    "        ts = js['types']\n",
    "        tbl_str[id] = tbl_def_to_string(id,hdr,ts)\n",
    "        tbl_cols[id] = hdr\n",
    "        tbl_types[id] = ts\n",
    "\n",
    "\n",
    "nl_q = []\n",
    "sql_a = []\n",
    "\n",
    "with open('data/train.jsonl') as f:\n",
    "    for line in f:\n",
    "        js = json.loads(line)\n",
    "        id = js['table_id']\n",
    "        s = tbl_str[id]\n",
    "        qst = js['question']\n",
    "        nl = prefix + \"\\n### Question: \" + qst + s\n",
    "        nl_q.append(nl)\n",
    "\n",
    "        sql = js['sql']\n",
    "        a = fix_repr(sql,tbl_cols[id],tbl_types[id],id)\n",
    "        a = '\\n### Answer: ' + a\n",
    "        sql_a.append(a)\n",
    "\n",
    "\n",
    "M = len(nl_q)\n",
    "\n",
    "data_txt = [nl_q[i] + sql_a[i] for i in range(len(nl_q))]\n",
    "\n",
    "for i in range(5):\n",
    "    j = random.randint(0,M-1)\n",
    "    print()\n",
    "    print(data_txt[j]) \n",
    "        \n",
    "   "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set up the details for the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4f44918087484dd58b958a64cabdecb6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/56355 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from peft import LoraConfig, get_peft_model\n",
    "import transformers\n",
    "import datasets\n",
    "\n",
    "LORA_R = 4\n",
    "LORA_ALPHA = 16\n",
    "LORA_DROPOUT = .1\n",
    "CUTOFF_LEN = 256\n",
    "BATCH = 128\n",
    "MICRO_BATCH = 4\n",
    "N_GAS = BATCH//MICRO_BATCH\n",
    "EPOCHS = 1\n",
    "LR = 1e-5\n",
    "\n",
    "lora_cfg = LoraConfig(\n",
    "    r = LORA_R,\n",
    "    lora_alpha=LORA_ALPHA,\n",
    "    lora_dropout=LORA_DROPOUT,\n",
    "    task_type='CASUAL_LM',\n",
    "    target_modules=['q_proj','v_proj']\n",
    ")\n",
    "\n",
    "modad = get_peft_model(model,lora_cfg)\n",
    "\n",
    "tokenizer.pad_token_id = 0\n",
    "\n",
    "d = {'prompt': data_txt}\n",
    "\n",
    "data = datasets.Dataset.from_dict(d)\n",
    "data = data.map(lambda x:\n",
    "        tokenizer(\n",
    "        x['prompt'],\n",
    "        truncation=True,\n",
    "        max_length=CUTOFF_LEN,\n",
    "        padding=\"max_length\"\n",
    "        ))\n",
    "\n",
    "#data.remove_columns('prompt')\n",
    "\n",
    "targs = transformers.TrainingArguments(\n",
    "    per_device_train_batch_size=MICRO_BATCH,\n",
    "    gradient_accumulation_steps=N_GAS,\n",
    "    warmup_steps=0,\n",
    "    num_train_epochs=EPOCHS,\n",
    "    learning_rate=LR,\n",
    "    fp16=True,\n",
    "    logging_steps=1,\n",
    "    output_dir='sqllama-out',\n",
    "    save_total_limit=3,\n",
    "    remove_unused_columns=False\n",
    ")\n",
    "\n",
    "\n",
    "modad.config.use_cache = False"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "ignore - just trying to figure out huggingface datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['prompt', 'input_ids', 'attention_mask'],\n",
      "    num_rows: 56355\n",
      "})\n",
      "{'prompt': \"Below is a question that describes a data request, paired with an input that describes a SQL table.  Write a SQL query that retrieves the data.\\n### Question: Tell me what the notes are for South Australia \\n### Input: Table 1-1000181-1 has columns State/territory (text),Text/background colour (text),Format (text),Current slogan (text),Current series (text),Notes (text).  \\n### Answer: SELECT  Notes FROM 1-1000181-1 WHERE Current slogan = 'SOUTH AUSTRALIA'\", 'input_ids': [0, 13866, 338, 263, 1139, 393, 16612, 263, 848, 2009, 29892, 3300, 2859, 411, 385, 1881, 393, 16612, 263, 3758, 1591, 29889, 29871, 14350, 263, 3758, 2346, 393, 5663, 17180, 278, 848, 29889, 13, 2277, 29937, 894, 29901, 24948, 592, 825, 278, 11486, 526, 363, 4275, 8314, 29871, 13, 2277, 29937, 10567, 29901, 6137, 29871, 29896, 29899, 29896, 29900, 29900, 29900, 29896, 29947, 29896, 29899, 29896, 756, 4341, 4306, 29914, 357, 768, 706, 313, 726, 511, 1626, 29914, 7042, 12384, 313, 726, 511, 5809, 313, 726, 511, 7583, 269, 1188, 273, 313, 726, 511, 7583, 3652, 313, 726, 511, 3664, 267, 313, 726, 467, 259, 13, 2277, 29937, 673, 29901, 5097, 29871, 8695, 3895, 29871, 29896, 29899, 29896, 29900, 29900, 29900, 29896, 29947, 29896, 29899, 29896, 5754, 9626, 269, 1188, 273, 353, 525, 6156, 2692, 29950, 319, 29965, 10810, 1964, 10764, 29915, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}\n"
     ]
    }
   ],
   "source": [
    "print(data)\n",
    "print(data[0])\n",
    "\n",
    "#from datasets import load_dataset\n",
    "\n",
    "\n",
    "#!git clone https://github.com/tloen/alpaca-lora.git\n",
    "#dalp = load_dataset(\"json\", data_files=\"alpaca-lora/alpaca_data.json\")\n",
    "#print(dalp)\n",
    "\n",
    "#dalp = dalp.map(lambda x : {'blah':'blah'})\n",
    "#print(dalp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/matt/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/optimization.py:395: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`prompt` in this case) have excessive nesting (inputs type `list` where type `int` is expected).",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36mconvert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m    716\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_tensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 717\u001b[0;31m                     \u001b[0mtensor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mas_tensor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    718\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: too many dimensions 'str'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m/var/tmp/ipykernel_2309/3549391384.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mdata_collator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransformers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataCollatorForLanguageModeling\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m )\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mtrainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'sqllama-out'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/trainer.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1664\u001b[0m             \u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresume_from_checkpoint\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1665\u001b[0m             \u001b[0mtrial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrial\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1666\u001b[0;31m             \u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_keys_for_eval\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1667\u001b[0m         )\n\u001b[1;32m   1668\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/trainer.py\u001b[0m in \u001b[0;36m_inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   1897\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1898\u001b[0m             \u001b[0mstep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1899\u001b[0;31m             \u001b[0;32mfor\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mepoch_iterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1900\u001b[0m                 \u001b[0mtotal_batched_samples\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1901\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mrng_to_sync\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    626\u001b[0m                 \u001b[0;31m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    627\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# type: ignore[call-arg]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 628\u001b[0;31m             \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_next_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    629\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_num_yielded\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    630\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dataset_kind\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_DatasetKind\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m_next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    669\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_next_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    670\u001b[0m         \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_next_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# may raise StopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 671\u001b[0;31m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dataset_fetcher\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfetch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m  \u001b[0;31m# may raise StopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    672\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pin_memory\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    673\u001b[0m             \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpin_memory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpin_memory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pin_memory_device\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py\u001b[0m in \u001b[0;36mfetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m     59\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     60\u001b[0m             \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mpossibly_batched_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcollate_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/data/data_collator.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, features, return_tensors)\u001b[0m\n\u001b[1;32m     43\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtf_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     44\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mreturn_tensors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"pt\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 45\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtorch_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     46\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mreturn_tensors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"np\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     47\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumpy_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/data/data_collator.py\u001b[0m in \u001b[0;36mtorch_call\u001b[0;34m(self, examples)\u001b[0m\n\u001b[1;32m    727\u001b[0m         \u001b[0;31m# Handle dict or lists with proper padding and conversion to tensor.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    728\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexamples\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mMapping\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m             \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexamples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_tensors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"pt\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpad_to_multiple_of\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpad_to_multiple_of\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    730\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    731\u001b[0m             batch = {\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36mpad\u001b[0;34m(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)\u001b[0m\n\u001b[1;32m   3033\u001b[0m                 \u001b[0mbatch_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3034\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3035\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mBatchEncoding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_outputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_tensors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3036\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3037\u001b[0m     def create_token_type_ids_from_sequences(\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)\u001b[0m\n\u001b[1;32m    208\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_n_sequences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_sequences\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 210\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconvert_to_tensors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtensor_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprepend_batch_axis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprepend_batch_axis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    211\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    212\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/hf/sqllama-V0/.venv/lib/python3.7/site-packages/transformers/tokenization_utils_base.py\u001b[0m in \u001b[0;36mconvert_to_tensors\u001b[0;34m(self, tensor_type, prepend_batch_axis)\u001b[0m\n\u001b[1;32m    736\u001b[0m                     \u001b[0;34mf\" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    737\u001b[0m                     \u001b[0;34m\" expected).\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 738\u001b[0;31m                 ) from e\n\u001b[0m\u001b[1;32m    739\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    740\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`prompt` in this case) have excessive nesting (inputs type `list` where type `int` is expected)."
     ]
    }
   ],
   "source": [
    "trainer = transformers.Trainer(\n",
    "    model = modad,\n",
    "    train_dataset = data,\n",
    "    args = targs,\n",
    "    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)\n",
    ")\n",
    "trainer.train(resume_from_checkpoint=False)\n",
    "model.save_pretrained('sqllama-out')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "6a381460736e8a0eabfb35eafae436ba15c06439de44e28b965ea473bd8dda90"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}