{
"cells": [
{
"cell_type": "markdown",
"id": "622dfcd6",
"metadata": {},
"source": [
"# Group 09 - Automatic Sentence Completion for PubMed"
]
},
{
"cell_type": "markdown",
"id": "5e4cec3c",
"metadata": {},
"source": [
"### Authors:\n",
"- Eric Münzberg\n",
"- Shahein Enjjar\t\n",
"- Leonard Starke"
]
},
{
"cell_type": "markdown",
"id": "ee9c1d92",
"metadata": {},
"source": [
"### Firstly try to import the data modules"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "e444b44c",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" from Bio import Entrez, Medline \n",
"except:\n",
" !pip install Bio\n",
" from Bio import Entrez, Medline \n"
]
},
{
"cell_type": "markdown",
"id": "7bf15c30",
"metadata": {},
"source": [
"### define function for loading the papers from PubMed database"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "adfb256a",
"metadata": {},
"outputs": [],
"source": [
"def getPapers(myQuery, maxPapers, myEmail =\"leonard.starke@mailbox.tu-dresden.de\"):\n",
" # Get articles from PubMed\n",
" Entrez.email = myEmail\n",
" record = Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n",
" idlist = record[\"IdList\"]\n",
" print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n",
" records = Medline.parse(Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\"))\n",
" return list(records)"
]
},
{
"cell_type": "markdown",
"id": "46bc6298",
"metadata": {},
"source": [
"### Verify that its working"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "39c3b352",
"metadata": {},
"outputs": [],
"source": [
"amountOfPapers = 100000"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "00481ec9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"There are 9999 records for Blood [tiab].\n"
]
}
],
"source": [
"myQuery =\"Blood [tiab]\" #query in title and abstract\n",
"maxPapers = amountOfPapers\n",
"records = getPapers(myQuery, maxPapers)"
]
},
{
"cell_type": "markdown",
"id": "b67747c6",
"metadata": {},
"source": [
"### Now extract abstracts from records"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "dcf5c217",
"metadata": {},
"outputs": [],
"source": [
"r_abstracts = []\n",
"for r in records:\n",
" if not (r.get('AB') is None):\n",
" r_abstracts.append(r['AB'])"
]
},
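{
"cell_type": "markdown",
"id": "a1f3c9d0",
"metadata": {},
"source": [
"### Quick sanity check: how many abstracts did we keep? (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2e4d8c1",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch: not every record has an 'AB' (abstract) field, so\n",
"# compare the kept abstracts against the fetched records and peek at one.\n",
"print(f\"Kept {len(r_abstracts)} abstracts out of {len(records)} records.\")\n",
"print(r_abstracts[0][:200])"
]
},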
{
"cell_type": "markdown",
"id": "e309f6fe",
"metadata": {},
"source": [
"### Now import torch modules needed to load the data"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c3199444",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import torch\n",
" from torch.utils.data import Dataset \n",
" from torchtext.data import get_tokenizer\n",
"except:\n",
" !pip --default-timeout=1000 install torch\n",
" !pip --default-timeout=1000 install torchtext\n",
" import torch\n",
" from torch.utils.data import Dataset \n",
" from torchtext.data import get_tokenizer"
]
},
{
"cell_type": "markdown",
"id": "5b4007e8",
"metadata": {},
"source": [
"### Import numpy"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "daca9db6",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import numpy as np\n",
"except:\n",
" !pip install numpy\n",
" import numpy as np\n"
]
},
{
"cell_type": "markdown",
"id": "ec1db50b",
"metadata": {},
"source": [
"### import math module"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "eb32bd79",
"metadata": {},
"outputs": [],
"source": [
"import math"
]
},
{
"cell_type": "markdown",
"id": "4df1e449",
"metadata": {},
"source": [
"### define token iterators"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3f23404d",
"metadata": {},
"outputs": [],
"source": [
"train_size = math.floor(len(r_abstracts) * 0.75)\n",
"val_size = math.floor(len(r_abstracts) * 0.125)\n",
"test_size = math.floor(len(r_abstracts) * 0.125)"
]
},
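{
"cell_type": "markdown",
"id": "c3d5e9f2",
"metadata": {},
"source": [
"### Sanity-check the split sizes (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e6f0a3",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch: the three floored splits must not exceed the total;\n",
"# flooring may drop a few abstracts, which is acceptable here.\n",
"print(train_size, val_size, test_size, len(r_abstracts))\n",
"assert train_size + val_size + test_size <= len(r_abstracts)"
]
},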
{
"cell_type": "code",
"execution_count": 35,
"id": "8a128d3c",
"metadata": {},
"outputs": [],
"source": [
"def train_abstract_iter():\n",
" for abstract in r_abstracts[:train_size]:\n",
" yield abstract"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "97e89986",
"metadata": {},
"outputs": [],
"source": [
"def val_abstract_iter():\n",
" for abstract in r_abstracts[(train_size + 1):(train_size + val_size)]:\n",
" yield abstract"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "0d6e89c4",
"metadata": {},
"outputs": [],
"source": [
"def test_abstract_iter():\n",
" for abstract in r_abstracts[(train_size + val_size + 1): (train_size + val_size + test_size)]:\n",
" yield abstract"
]
},
{
"cell_type": "markdown",
"id": "e5e9c5a2",
"metadata": {},
"source": [
"### define Tokenize function"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "0bdbc40a",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = get_tokenizer(\"basic_english\")\n",
"def tokenize_abstract_iter():\n",
" for abstract in r_abstracts:\n",
" yield tokenizer(abstract)"
]
},
{
"cell_type": "markdown",
"id": "37da40bb",
"metadata": {},
"source": [
"### Map every word to an id to store inside torch tensor"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "a438ab1f",
"metadata": {},
"outputs": [],
"source": [
"from torchtext.vocab import build_vocab_from_iterator\n",
"token_generator = tokenize_abstract_iter()\n",
"vocab = build_vocab_from_iterator(token_generator, specials=['<unk>'])\n",
"vocab.set_default_index(vocab['<unk>'])\n"
]
},
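{
"cell_type": "markdown",
"id": "e5f7a1b4",
"metadata": {},
"source": [
"### Round-trip a few tokens through the vocabulary (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a8b2c5",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch: vocab(...) maps tokens to ids and lookup_tokens maps\n",
"# ids back, so a round trip should reproduce the original tokens.\n",
"sample_tokens = tokenizer(r_abstracts[0])[:5]\n",
"sample_ids = vocab(sample_tokens)\n",
"print(sample_tokens, sample_ids, vocab.lookup_tokens(sample_ids))\n",
"print(f\"Vocabulary size: {len(vocab)}\")"
]
},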
{
"cell_type": "markdown",
"id": "221bdc48",
"metadata": {},
"source": [
"### now convert to tensor\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "0e5bc361",
"metadata": {},
"outputs": [],
"source": [
"def data_process(tokens_iter):\n",
" \"\"\"Converts raw text into a flat Tensor.\"\"\"\n",
" data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in tokens_iter]\n",
" return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "dfd7400d",
"metadata": {},
"outputs": [],
"source": [
"train_generator = train_abstract_iter()\n",
"val_generator = val_abstract_iter()\n",
"test_generator = test_abstract_iter()\n",
"train_data = data_process(train_generator)\n",
"val_data = data_process(val_generator)\n",
"test_data = data_process(test_generator)"
]
},
{
"cell_type": "markdown",
"id": "c49a2734",
"metadata": {},
"source": [
"### check gpu"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c155ee31",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "79b2d248",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cuda')"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"device"
]
},
{
"cell_type": "markdown",
"id": "2150ba71",
"metadata": {},
"source": [
"### define model"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "a33d722f",
"metadata": {},
"outputs": [],
"source": [
"from typing import Tuple\n",
"\n",
"from torch import nn, Tensor\n",
"import torch.nn.functional as F\n",
"from torch.nn import TransformerEncoder, TransformerEncoderLayer\n",
"from torch.utils.data import dataset\n",
"\n",
"class TransformerModel(nn.Module):\n",
"\n",
" def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,\n",
" nlayers: int, dropout: float = 0.5):\n",
" super().__init__()\n",
" self.model_type = 'Transformer'\n",
" self.pos_encoder = PositionalEncoding(d_model, dropout)\n",
" encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)\n",
" self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)\n",
" self.encoder = nn.Embedding(ntoken, d_model)\n",
" self.d_model = d_model\n",
" self.decoder = nn.Linear(d_model, ntoken)\n",
"\n",
" self.init_weights()\n",
"\n",
" def init_weights(self) -> None:\n",
" initrange = 0.1\n",
" self.encoder.weight.data.uniform_(-initrange, initrange)\n",
" self.decoder.bias.data.zero_()\n",
" self.decoder.weight.data.uniform_(-initrange, initrange)\n",
"\n",
" def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:\n",
" \"\"\"\n",
" Args:\n",
" src: Tensor, shape [seq_len, batch_size]\n",
" src_mask: Tensor, shape [seq_len, seq_len]\n",
"\n",
" Returns:\n",
" output Tensor of shape [seq_len, batch_size, ntoken]\n",
" \"\"\"\n",
" src = self.encoder(src) * math.sqrt(self.d_model)\n",
" src = self.pos_encoder(src)\n",
" output = self.transformer_encoder(src, src_mask)\n",
" output = self.decoder(output)\n",
" return output\n",
"\n",
"\n",
"def generate_square_subsequent_mask(sz: int) -> Tensor:\n",
" \"\"\"Generates an upper-triangular matrix of -inf, with zeros on diag.\"\"\"\n",
" return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)"
]
},
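{
"cell_type": "markdown",
"id": "a7b9c3d6",
"metadata": {},
"source": [
"### Inspect the causal mask (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8c0d4e7",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch for a length-5 sequence: position i may only attend to\n",
"# positions <= i; the '-inf' entries are zeroed out by the softmax.\n",
"print(generate_square_subsequent_mask(5))"
]
},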
{
"cell_type": "markdown",
"id": "23268efe",
"metadata": {},
"source": [
"### define pos encoder"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "c2f6d33b",
"metadata": {},
"outputs": [],
"source": [
"class PositionalEncoding(nn.Module):\n",
"\n",
" def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):\n",
" super().__init__()\n",
" self.dropout = nn.Dropout(p=dropout)\n",
"\n",
" position = torch.arange(max_len).unsqueeze(1)\n",
" div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))\n",
" pe = torch.zeros(max_len, 1, d_model)\n",
" pe[:, 0, 0::2] = torch.sin(position * div_term)\n",
" pe[:, 0, 1::2] = torch.cos(position * div_term)\n",
" self.register_buffer('pe', pe)\n",
"\n",
" def forward(self, x: Tensor) -> Tensor:\n",
" \"\"\"\n",
" Args:\n",
" x: Tensor, shape [seq_len, batch_size, embedding_dim]\n",
" \"\"\"\n",
" x = x + self.pe[:x.size(0)]\n",
" return self.dropout(x)\n"
]
},
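{
"cell_type": "markdown",
"id": "c9d1e5f8",
"metadata": {},
"source": [
"### Shape check for the positional encoding (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d0e2f6a9",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch with hypothetical sizes: the module must preserve the\n",
"# [seq_len, batch_size, embedding_dim] shape while adding position info.\n",
"pe_demo = PositionalEncoding(d_model=8, dropout=0.0)\n",
"print(pe_demo(torch.zeros(4, 1, 8)).shape)"
]
},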
{
"cell_type": "markdown",
"id": "306352f5",
"metadata": {},
"source": [
"### define function to create batches of data and create batches"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "9e184841",
"metadata": {},
"outputs": [],
"source": [
"def batchify(data: Tensor, bsz: int) -> Tensor:\n",
" \"\"\"Divides the data into bsz separate sequences, removing extra elements\n",
" that wouldn't cleanly fit.\n",
"\n",
" Args:\n",
" data: Tensor, shape [N]\n",
" bsz: int, batch size\n",
"\n",
" Returns:\n",
" Tensor of shape [N // bsz, bsz]\n",
" \"\"\"\n",
" seq_len = data.size(0) // bsz\n",
" data = data[:seq_len * bsz]\n",
" data = data.view(bsz, seq_len).t().contiguous()\n",
" return data.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "a4def1ac",
"metadata": {},
"outputs": [],
"source": [
"batch_size = 20\n",
"eval_batch_size = 10\n",
"train_data = batchify(train_data, batch_size) # shape [seq_len, batch_size]\n",
"val_data = batchify(val_data, eval_batch_size)\n",
"test_data = batchify(test_data, eval_batch_size)"
]
},
{
"cell_type": "markdown",
"id": "c658cb42",
"metadata": {},
"source": [
"### define function to get batch"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "4ab5b8fd",
"metadata": {},
"outputs": [],
"source": [
"bptt = 35\n",
"def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:\n",
" \"\"\"\n",
" Args:\n",
" source: Tensor, shape [full_seq_len, batch_size]\n",
" i: int\n",
"\n",
" Returns:\n",
" tuple (data, target), where data has shape [seq_len, batch_size] and\n",
" target has shape [seq_len * batch_size]\n",
" \"\"\"\n",
" seq_len = min(bptt, len(source) - 1 - i)\n",
" data = source[i:i+seq_len]\n",
" target = source[i+1:i+1+seq_len].reshape(-1)\n",
" return data, target"
]
},
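{
"cell_type": "markdown",
"id": "e1f3a7b0",
"metadata": {},
"source": [
"### Toy example of batchify and get_batch (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2a4b8c1",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch on a hypothetical toy tensor: 26 token ids become a\n",
"# [6, 4] batch (two ids are dropped), and get_batch returns data of shape\n",
"# [seq_len, batch_size] with targets shifted by one position.\n",
"toy_batched = batchify(torch.arange(26), 4)\n",
"toy_x, toy_y = get_batch(toy_batched, 0)\n",
"print(toy_batched.shape, toy_x.shape, toy_y.shape)"
]
},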
{
"cell_type": "markdown",
"id": "d6392484",
"metadata": {},
"source": [
"### define parameters and init model"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "c53764da",
"metadata": {},
"outputs": [],
"source": [
"ntokens = len(vocab) # size of vocabulary\n",
"emsize = 200 # embedding dimension\n",
"d_hid = 200 # dimension of the feedforward network model in nn.TransformerEncoder\n",
"nlayers = 2 # number of nn.TransformerEncoderLayer in nn.TransformerEncoder\n",
"nhead = 2 # number of heads in nn.MultiheadAttention\n",
"dropout = 0.2 # dropout probability\n",
"model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)"
]
},
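{
"cell_type": "markdown",
"id": "a3b5c9d2",
"metadata": {},
"source": [
"### Count the trainable parameters (a minimal sketch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4c6d0e3",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch: sum the element counts of all trainable weights.\n",
"n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"print(f\"{n_params:,} trainable parameters\")"
]
},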
{
"cell_type": "markdown",
"id": "7fb67d72",
"metadata": {},
"source": [
"### init optimizer, loss, scheduler etc."
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "ddaa1d64",
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import time\n",
"\n",
"criterion = nn.CrossEntropyLoss()\n",
"lr = 5.0 # learning rate\n",
"optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)"
]
},
{
"cell_type": "markdown",
"id": "dda19446",
"metadata": {},
"source": [
"### define train function"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "50ab3fb6",
"metadata": {},
"outputs": [],
"source": [
"def train(model: nn.Module) -> None:\n",
" model.train() # turn on train mode\n",
" total_loss = 0.\n",
" log_interval = 200\n",
" start_time = time.time()\n",
" src_mask = generate_square_subsequent_mask(bptt).to(device)\n",
"\n",
" num_batches = len(train_data) // bptt\n",
" for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):\n",
" data, targets = get_batch(train_data, i)\n",
" seq_len = data.size(0)\n",
" if seq_len != bptt: # only on last batch\n",
" src_mask = src_mask[:seq_len, :seq_len]\n",
" output = model(data, src_mask)\n",
" loss = criterion(output.view(-1, ntokens), targets)\n",
"\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)\n",
" optimizer.step()\n",
"\n",
" total_loss += loss.item()\n",
" if batch % log_interval == 0 and batch > 0:\n",
" lr = scheduler.get_last_lr()[0]\n",
" ms_per_batch = (time.time() - start_time) * 1000 / log_interval\n",
" cur_loss = total_loss / log_interval\n",
" ppl = math.exp(cur_loss)\n",
" print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '\n",
" f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '\n",
" f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')\n",
" total_loss = 0\n",
" start_time = time.time()"
]
},
{
"cell_type": "markdown",
"id": "9756c092",
"metadata": {},
"source": [
"### define evaluate function"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "3d179bb0",
"metadata": {},
"outputs": [],
"source": [
"def evaluate(model: nn.Module, eval_data: Tensor) -> float:\n",
" model.eval() # turn on evaluation mode\n",
" total_loss = 0.\n",
" src_mask = generate_square_subsequent_mask(bptt).to(device)\n",
" with torch.no_grad():\n",
" for i in range(0, eval_data.size(0) - 1, bptt):\n",
" data, targets = get_batch(eval_data, i)\n",
" seq_len = data.size(0)\n",
" if seq_len != bptt:\n",
" src_mask = src_mask[:seq_len, :seq_len]\n",
" output = model(data, src_mask)\n",
" output_flat = output.view(-1, ntokens)\n",
" total_loss += seq_len * criterion(output_flat, targets).item()\n",
" return total_loss / (len(eval_data) - 1)"
]
},
{
"cell_type": "markdown",
"id": "5a959f09",
"metadata": {},
"source": [
"### now we can start training the model while saving best one"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "09c4d4ce",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 1 | 200/ 3181 batches | lr 5.00 | ms/batch 101.32 | loss 9.07 | ppl 8713.01\n",
"| epoch 1 | 400/ 3181 batches | lr 5.00 | ms/batch 60.68 | loss 7.32 | ppl 1516.45\n",
"| epoch 1 | 600/ 3181 batches | lr 5.00 | ms/batch 60.86 | loss 6.78 | ppl 878.02\n",
"| epoch 1 | 800/ 3181 batches | lr 5.00 | ms/batch 60.93 | loss 6.44 | ppl 628.78\n",
"| epoch 1 | 1000/ 3181 batches | lr 5.00 | ms/batch 60.95 | loss 6.31 | ppl 551.05\n",
"| epoch 1 | 1200/ 3181 batches | lr 5.00 | ms/batch 60.99 | loss 6.19 | ppl 486.01\n",
"| epoch 1 | 1400/ 3181 batches | lr 5.00 | ms/batch 61.07 | loss 6.09 | ppl 441.65\n",
"| epoch 1 | 1600/ 3181 batches | lr 5.00 | ms/batch 61.08 | loss 6.07 | ppl 431.75\n",
"| epoch 1 | 1800/ 3181 batches | lr 5.00 | ms/batch 61.06 | loss 6.00 | ppl 403.14\n",
"| epoch 1 | 2000/ 3181 batches | lr 5.00 | ms/batch 61.16 | loss 5.91 | ppl 367.09\n",
"| epoch 1 | 2200/ 3181 batches | lr 5.00 | ms/batch 61.25 | loss 5.89 | ppl 359.65\n",
"| epoch 1 | 2400/ 3181 batches | lr 5.00 | ms/batch 61.37 | loss 5.86 | ppl 349.26\n",
"| epoch 1 | 2600/ 3181 batches | lr 5.00 | ms/batch 61.25 | loss 5.77 | ppl 319.99\n",
"| epoch 1 | 2800/ 3181 batches | lr 5.00 | ms/batch 61.29 | loss 5.80 | ppl 330.72\n",
"| epoch 1 | 3000/ 3181 batches | lr 5.00 | ms/batch 61.37 | loss 5.71 | ppl 303.17\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 1 | time: 213.85s | valid loss 5.73 | valid ppl 307.24\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 2 | 200/ 3181 batches | lr 4.75 | ms/batch 61.64 | loss 5.71 | ppl 302.70\n",
"| epoch 2 | 400/ 3181 batches | lr 4.75 | ms/batch 61.23 | loss 5.63 | ppl 279.76\n",
"| epoch 2 | 600/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.63 | ppl 277.70\n",
"| epoch 2 | 800/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.56 | ppl 260.48\n",
"| epoch 2 | 1000/ 3181 batches | lr 4.75 | ms/batch 61.24 | loss 5.58 | ppl 266.00\n",
"| epoch 2 | 1200/ 3181 batches | lr 4.75 | ms/batch 61.37 | loss 5.55 | ppl 257.88\n",
"| epoch 2 | 1400/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.53 | ppl 251.27\n",
"| epoch 2 | 1600/ 3181 batches | lr 4.75 | ms/batch 61.30 | loss 5.55 | ppl 255.98\n",
"| epoch 2 | 1800/ 3181 batches | lr 4.75 | ms/batch 61.29 | loss 5.53 | ppl 253.01\n",
"| epoch 2 | 2000/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.48 | ppl 238.90\n",
"| epoch 2 | 2200/ 3181 batches | lr 4.75 | ms/batch 61.35 | loss 5.46 | ppl 235.16\n",
"| epoch 2 | 2400/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.46 | ppl 235.11\n",
"| epoch 2 | 2600/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.40 | ppl 221.12\n",
"| epoch 2 | 2800/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.46 | ppl 234.30\n",
"| epoch 2 | 3000/ 3181 batches | lr 4.75 | ms/batch 61.28 | loss 5.37 | ppl 214.39\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 2 | time: 206.48s | valid loss 5.53 | valid ppl 252.22\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 3 | 200/ 3181 batches | lr 4.51 | ms/batch 61.62 | loss 5.42 | ppl 226.39\n",
"| epoch 3 | 400/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.36 | ppl 212.24\n",
"| epoch 3 | 600/ 3181 batches | lr 4.51 | ms/batch 61.31 | loss 5.34 | ppl 209.08\n",
"| epoch 3 | 800/ 3181 batches | lr 4.51 | ms/batch 61.32 | loss 5.31 | ppl 201.91\n",
"| epoch 3 | 1000/ 3181 batches | lr 4.51 | ms/batch 61.29 | loss 5.33 | ppl 207.08\n",
"| epoch 3 | 1200/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.30 | ppl 200.84\n",
"| epoch 3 | 1400/ 3181 batches | lr 4.51 | ms/batch 61.32 | loss 5.29 | ppl 198.48\n",
"| epoch 3 | 1600/ 3181 batches | lr 4.51 | ms/batch 61.30 | loss 5.31 | ppl 202.12\n",
"| epoch 3 | 1800/ 3181 batches | lr 4.51 | ms/batch 61.35 | loss 5.30 | ppl 200.79\n",
"| epoch 3 | 2000/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.26 | ppl 191.59\n",
"| epoch 3 | 2200/ 3181 batches | lr 4.51 | ms/batch 61.34 | loss 5.25 | ppl 190.89\n",
"| epoch 3 | 2400/ 3181 batches | lr 4.51 | ms/batch 61.39 | loss 5.25 | ppl 190.57\n",
"| epoch 3 | 2600/ 3181 batches | lr 4.51 | ms/batch 61.42 | loss 5.19 | ppl 180.17\n",
"| epoch 3 | 2800/ 3181 batches | lr 4.51 | ms/batch 61.38 | loss 5.26 | ppl 191.72\n",
"| epoch 3 | 3000/ 3181 batches | lr 4.51 | ms/batch 61.34 | loss 5.18 | ppl 177.08\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 3 | time: 206.57s | valid loss 5.44 | valid ppl 231.07\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 4 | 200/ 3181 batches | lr 4.29 | ms/batch 61.64 | loss 5.25 | ppl 190.59\n",
"| epoch 4 | 400/ 3181 batches | lr 4.29 | ms/batch 61.50 | loss 5.19 | ppl 178.85\n",
"| epoch 4 | 600/ 3181 batches | lr 4.29 | ms/batch 61.26 | loss 5.17 | ppl 176.66\n",
"| epoch 4 | 800/ 3181 batches | lr 4.29 | ms/batch 61.36 | loss 5.15 | ppl 172.78\n",
"| epoch 4 | 1000/ 3181 batches | lr 4.29 | ms/batch 61.32 | loss 5.19 | ppl 179.49\n",
"| epoch 4 | 1200/ 3181 batches | lr 4.29 | ms/batch 61.38 | loss 5.15 | ppl 172.65\n",
"| epoch 4 | 1400/ 3181 batches | lr 4.29 | ms/batch 61.39 | loss 5.14 | ppl 170.97\n",
"| epoch 4 | 1600/ 3181 batches | lr 4.29 | ms/batch 61.42 | loss 5.16 | ppl 174.44\n",
"| epoch 4 | 1800/ 3181 batches | lr 4.29 | ms/batch 61.40 | loss 5.16 | ppl 174.19\n",
"| epoch 4 | 2000/ 3181 batches | lr 4.29 | ms/batch 61.38 | loss 5.11 | ppl 166.50\n",
"| epoch 4 | 2200/ 3181 batches | lr 4.29 | ms/batch 61.37 | loss 5.10 | ppl 164.42\n",
"| epoch 4 | 2400/ 3181 batches | lr 4.29 | ms/batch 61.33 | loss 5.11 | ppl 165.60\n",
"| epoch 4 | 2600/ 3181 batches | lr 4.29 | ms/batch 61.41 | loss 5.06 | ppl 157.61\n",
"| epoch 4 | 2800/ 3181 batches | lr 4.29 | ms/batch 61.29 | loss 5.12 | ppl 167.69\n",
"| epoch 4 | 3000/ 3181 batches | lr 4.29 | ms/batch 61.37 | loss 5.05 | ppl 155.52\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 4 | time: 206.63s | valid loss 5.39 | valid ppl 218.67\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 5 | 200/ 3181 batches | lr 4.07 | ms/batch 61.64 | loss 5.13 | ppl 168.25\n",
"| epoch 5 | 400/ 3181 batches | lr 4.07 | ms/batch 61.30 | loss 5.06 | ppl 156.82\n",
"| epoch 5 | 600/ 3181 batches | lr 4.07 | ms/batch 61.38 | loss 5.04 | ppl 155.08\n",
"| epoch 5 | 800/ 3181 batches | lr 4.07 | ms/batch 61.33 | loss 5.03 | ppl 152.77\n",
"| epoch 5 | 1000/ 3181 batches | lr 4.07 | ms/batch 61.37 | loss 5.05 | ppl 156.69\n",
"| epoch 5 | 1200/ 3181 batches | lr 4.07 | ms/batch 61.32 | loss 5.02 | ppl 151.80\n",
"| epoch 5 | 1400/ 3181 batches | lr 4.07 | ms/batch 61.36 | loss 5.02 | ppl 151.68\n",
"| epoch 5 | 1600/ 3181 batches | lr 4.07 | ms/batch 61.37 | loss 5.03 | ppl 152.67\n",
"| epoch 5 | 1800/ 3181 batches | lr 4.07 | ms/batch 61.39 | loss 5.03 | ppl 152.77\n",
"| epoch 5 | 2000/ 3181 batches | lr 4.07 | ms/batch 61.35 | loss 4.99 | ppl 147.43\n",
"| epoch 5 | 2200/ 3181 batches | lr 4.07 | ms/batch 61.23 | loss 4.98 | ppl 145.22\n",
"| epoch 5 | 2400/ 3181 batches | lr 4.07 | ms/batch 61.33 | loss 4.99 | ppl 146.65\n",
"| epoch 5 | 2600/ 3181 batches | lr 4.07 | ms/batch 61.36 | loss 4.94 | ppl 140.06\n",
"| epoch 5 | 2800/ 3181 batches | lr 4.07 | ms/batch 61.35 | loss 5.01 | ppl 149.53\n",
"| epoch 5 | 3000/ 3181 batches | lr 4.07 | ms/batch 61.32 | loss 4.92 | ppl 136.69\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 5 | time: 206.57s | valid loss 5.41 | valid ppl 223.14\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 6 | 200/ 3181 batches | lr 3.87 | ms/batch 61.65 | loss 5.01 | ppl 149.89\n",
"| epoch 6 | 400/ 3181 batches | lr 3.87 | ms/batch 61.33 | loss 4.94 | ppl 140.08\n",
"| epoch 6 | 600/ 3181 batches | lr 3.87 | ms/batch 61.43 | loss 4.93 | ppl 137.75\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 6 | 800/ 3181 batches | lr 3.87 | ms/batch 61.31 | loss 4.91 | ppl 135.89\n",
"| epoch 6 | 1000/ 3181 batches | lr 3.87 | ms/batch 61.29 | loss 4.95 | ppl 141.06\n",
"| epoch 6 | 1200/ 3181 batches | lr 3.87 | ms/batch 61.38 | loss 4.90 | ppl 134.49\n",
"| epoch 6 | 1400/ 3181 batches | lr 3.87 | ms/batch 61.33 | loss 4.91 | ppl 135.28\n",
"| epoch 6 | 1600/ 3181 batches | lr 3.87 | ms/batch 61.37 | loss 4.91 | ppl 136.26\n",
"| epoch 6 | 1800/ 3181 batches | lr 3.87 | ms/batch 61.36 | loss 4.93 | ppl 137.81\n",
"| epoch 6 | 2000/ 3181 batches | lr 3.87 | ms/batch 61.40 | loss 4.88 | ppl 131.80\n",
"| epoch 6 | 2200/ 3181 batches | lr 3.87 | ms/batch 61.43 | loss 4.87 | ppl 130.59\n",
"| epoch 6 | 2400/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.87 | ppl 130.95\n",
"| epoch 6 | 2600/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.83 | ppl 125.04\n",
"| epoch 6 | 2800/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.91 | ppl 135.49\n",
"| epoch 6 | 3000/ 3181 batches | lr 3.87 | ms/batch 61.29 | loss 4.81 | ppl 122.53\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 6 | time: 206.60s | valid loss 5.37 | valid ppl 214.67\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 7 | 200/ 3181 batches | lr 3.68 | ms/batch 61.66 | loss 4.91 | ppl 135.30\n",
"| epoch 7 | 400/ 3181 batches | lr 3.68 | ms/batch 61.23 | loss 4.83 | ppl 125.76\n",
"| epoch 7 | 600/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.83 | ppl 125.52\n",
"| epoch 7 | 800/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.82 | ppl 123.59\n",
"| epoch 7 | 1000/ 3181 batches | lr 3.68 | ms/batch 61.35 | loss 4.85 | ppl 127.65\n",
"| epoch 7 | 1200/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.81 | ppl 122.34\n",
"| epoch 7 | 1400/ 3181 batches | lr 3.68 | ms/batch 61.38 | loss 4.81 | ppl 123.22\n",
"| epoch 7 | 1600/ 3181 batches | lr 3.68 | ms/batch 61.34 | loss 4.82 | ppl 123.82\n",
"| epoch 7 | 1800/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.83 | ppl 125.22\n",
"| epoch 7 | 2000/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.79 | ppl 119.76\n",
"| epoch 7 | 2200/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.78 | ppl 118.99\n",
"| epoch 7 | 2400/ 3181 batches | lr 3.68 | ms/batch 61.33 | loss 4.78 | ppl 118.59\n",
"| epoch 7 | 2600/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.73 | ppl 113.60\n",
"| epoch 7 | 2800/ 3181 batches | lr 3.68 | ms/batch 61.33 | loss 4.80 | ppl 122.09\n",
"| epoch 7 | 3000/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.71 | ppl 111.19\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 7 | time: 206.61s | valid loss 5.35 | valid ppl 210.55\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 8 | 200/ 3181 batches | lr 3.49 | ms/batch 61.69 | loss 4.81 | ppl 122.20\n",
"| epoch 8 | 400/ 3181 batches | lr 3.49 | ms/batch 61.28 | loss 4.74 | ppl 114.11\n",
"| epoch 8 | 600/ 3181 batches | lr 3.49 | ms/batch 61.41 | loss 4.73 | ppl 113.82\n",
"| epoch 8 | 800/ 3181 batches | lr 3.49 | ms/batch 61.35 | loss 4.73 | ppl 113.04\n",
"| epoch 8 | 1000/ 3181 batches | lr 3.49 | ms/batch 61.46 | loss 4.75 | ppl 115.84\n",
"| epoch 8 | 1200/ 3181 batches | lr 3.49 | ms/batch 61.43 | loss 4.71 | ppl 111.58\n",
"| epoch 8 | 1400/ 3181 batches | lr 3.49 | ms/batch 61.37 | loss 4.72 | ppl 111.84\n",
"| epoch 8 | 1600/ 3181 batches | lr 3.49 | ms/batch 61.39 | loss 4.72 | ppl 112.52\n",
"| epoch 8 | 1800/ 3181 batches | lr 3.49 | ms/batch 61.44 | loss 4.74 | ppl 114.44\n",
"| epoch 8 | 2000/ 3181 batches | lr 3.49 | ms/batch 61.37 | loss 4.70 | ppl 109.63\n",
"| epoch 8 | 2200/ 3181 batches | lr 3.49 | ms/batch 61.31 | loss 4.68 | ppl 108.29\n",
"| epoch 8 | 2400/ 3181 batches | lr 3.49 | ms/batch 61.28 | loss 4.69 | ppl 108.78\n",
"| epoch 8 | 2600/ 3181 batches | lr 3.49 | ms/batch 61.30 | loss 4.64 | ppl 103.90\n",
"| epoch 8 | 2800/ 3181 batches | lr 3.49 | ms/batch 61.33 | loss 4.72 | ppl 111.83\n",
"| epoch 8 | 3000/ 3181 batches | lr 3.49 | ms/batch 61.33 | loss 4.62 | ppl 101.24\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 8 | time: 206.60s | valid loss 5.34 | valid ppl 208.08\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 9 | 200/ 3181 batches | lr 3.32 | ms/batch 61.64 | loss 4.72 | ppl 111.95\n",
"| epoch 9 | 400/ 3181 batches | lr 3.32 | ms/batch 61.40 | loss 4.65 | ppl 104.38\n",
"| epoch 9 | 600/ 3181 batches | lr 3.32 | ms/batch 61.33 | loss 4.64 | ppl 103.97\n",
"| epoch 9 | 800/ 3181 batches | lr 3.32 | ms/batch 61.32 | loss 4.64 | ppl 103.60\n",
"| epoch 9 | 1000/ 3181 batches | lr 3.32 | ms/batch 61.40 | loss 4.68 | ppl 107.40\n",
"| epoch 9 | 1200/ 3181 batches | lr 3.32 | ms/batch 61.39 | loss 4.62 | ppl 101.89\n",
"| epoch 9 | 1400/ 3181 batches | lr 3.32 | ms/batch 61.33 | loss 4.64 | ppl 103.60\n",
"| epoch 9 | 1600/ 3181 batches | lr 3.32 | ms/batch 61.30 | loss 4.64 | ppl 103.54\n",
"| epoch 9 | 1800/ 3181 batches | lr 3.32 | ms/batch 61.31 | loss 4.66 | ppl 105.35\n",
"| epoch 9 | 2000/ 3181 batches | lr 3.32 | ms/batch 61.36 | loss 4.62 | ppl 101.24\n",
"| epoch 9 | 2200/ 3181 batches | lr 3.32 | ms/batch 61.28 | loss 4.60 | ppl 99.91\n",
"| epoch 9 | 2400/ 3181 batches | lr 3.32 | ms/batch 61.34 | loss 4.61 | ppl 100.17\n",
"| epoch 9 | 2600/ 3181 batches | lr 3.32 | ms/batch 61.36 | loss 4.56 | ppl 95.58\n",
"| epoch 9 | 2800/ 3181 batches | lr 3.32 | ms/batch 61.43 | loss 4.63 | ppl 102.81\n",
"| epoch 9 | 3000/ 3181 batches | lr 3.32 | ms/batch 61.38 | loss 4.54 | ppl 93.66\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 9 | time: 206.58s | valid loss 5.35 | valid ppl 209.83\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 10 | 200/ 3181 batches | lr 3.15 | ms/batch 61.64 | loss 4.64 | ppl 103.90\n",
"| epoch 10 | 400/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.57 | ppl 96.88\n",
"| epoch 10 | 600/ 3181 batches | lr 3.15 | ms/batch 61.35 | loss 4.56 | ppl 95.86\n",
"| epoch 10 | 800/ 3181 batches | lr 3.15 | ms/batch 61.32 | loss 4.56 | ppl 95.84\n",
"| epoch 10 | 1000/ 3181 batches | lr 3.15 | ms/batch 61.33 | loss 4.59 | ppl 98.74\n",
"| epoch 10 | 1200/ 3181 batches | lr 3.15 | ms/batch 61.32 | loss 4.55 | ppl 94.35\n",
"| epoch 10 | 1400/ 3181 batches | lr 3.15 | ms/batch 61.27 | loss 4.56 | ppl 95.77\n",
"| epoch 10 | 1600/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.55 | ppl 94.76\n",
"| epoch 10 | 1800/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.57 | ppl 96.99\n",
"| epoch 10 | 2000/ 3181 batches | lr 3.15 | ms/batch 61.34 | loss 4.54 | ppl 93.41\n",
"| epoch 10 | 2200/ 3181 batches | lr 3.15 | ms/batch 61.29 | loss 4.53 | ppl 92.30\n",
"| epoch 10 | 2400/ 3181 batches | lr 3.15 | ms/batch 61.36 | loss 4.53 | ppl 92.36\n",
"| epoch 10 | 2600/ 3181 batches | lr 3.15 | ms/batch 61.34 | loss 4.48 | ppl 88.41\n",
"| epoch 10 | 2800/ 3181 batches | lr 3.15 | ms/batch 61.35 | loss 4.56 | ppl 95.78\n",
"| epoch 10 | 3000/ 3181 batches | lr 3.15 | ms/batch 61.33 | loss 4.46 | ppl 86.56\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 10 | time: 206.54s | valid loss 5.38 | valid ppl 216.73\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 11 | 200/ 3181 batches | lr 2.99 | ms/batch 61.65 | loss 4.57 | ppl 96.07\n",
"| epoch 11 | 400/ 3181 batches | lr 2.99 | ms/batch 61.42 | loss 4.50 | ppl 89.75\n",
"| epoch 11 | 600/ 3181 batches | lr 2.99 | ms/batch 61.39 | loss 4.49 | ppl 88.98\n",
"| epoch 11 | 800/ 3181 batches | lr 2.99 | ms/batch 61.33 | loss 4.49 | ppl 89.43\n",
"| epoch 11 | 1000/ 3181 batches | lr 2.99 | ms/batch 61.36 | loss 4.52 | ppl 92.09\n",
"| epoch 11 | 1200/ 3181 batches | lr 2.99 | ms/batch 61.45 | loss 4.47 | ppl 87.68\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 11 | 1400/ 3181 batches | lr 2.99 | ms/batch 61.38 | loss 4.49 | ppl 89.02\n",
"| epoch 11 | 1600/ 3181 batches | lr 2.99 | ms/batch 61.41 | loss 4.49 | ppl 89.15\n",
"| epoch 11 | 1800/ 3181 batches | lr 2.99 | ms/batch 61.33 | loss 4.50 | ppl 90.22\n",
"| epoch 11 | 2000/ 3181 batches | lr 2.99 | ms/batch 61.30 | loss 4.46 | ppl 86.81\n",
"| epoch 11 | 2200/ 3181 batches | lr 2.99 | ms/batch 61.35 | loss 4.45 | ppl 85.80\n",
"| epoch 11 | 2400/ 3181 batches | lr 2.99 | ms/batch 61.35 | loss 4.46 | ppl 86.48\n",
"| epoch 11 | 2600/ 3181 batches | lr 2.99 | ms/batch 61.30 | loss 4.41 | ppl 82.18\n",
"| epoch 11 | 2800/ 3181 batches | lr 2.99 | ms/batch 61.40 | loss 4.48 | ppl 88.42\n",
"| epoch 11 | 3000/ 3181 batches | lr 2.99 | ms/batch 61.42 | loss 4.39 | ppl 80.87\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 11 | time: 206.64s | valid loss 5.39 | valid ppl 219.73\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 12 | 200/ 3181 batches | lr 2.84 | ms/batch 61.75 | loss 4.50 | ppl 89.97\n",
"| epoch 12 | 400/ 3181 batches | lr 2.84 | ms/batch 61.42 | loss 4.43 | ppl 84.04\n",
"| epoch 12 | 600/ 3181 batches | lr 2.84 | ms/batch 61.45 | loss 4.42 | ppl 83.14\n",
"| epoch 12 | 800/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.42 | ppl 83.42\n",
"| epoch 12 | 1000/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.46 | ppl 86.36\n",
"| epoch 12 | 1200/ 3181 batches | lr 2.84 | ms/batch 61.37 | loss 4.41 | ppl 82.13\n",
"| epoch 12 | 1400/ 3181 batches | lr 2.84 | ms/batch 61.32 | loss 4.42 | ppl 83.46\n",
"| epoch 12 | 1600/ 3181 batches | lr 2.84 | ms/batch 61.38 | loss 4.42 | ppl 82.96\n",
"| epoch 12 | 1800/ 3181 batches | lr 2.84 | ms/batch 61.38 | loss 4.44 | ppl 84.42\n",
"| epoch 12 | 2000/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.40 | ppl 81.54\n",
"| epoch 12 | 2200/ 3181 batches | lr 2.84 | ms/batch 61.36 | loss 4.39 | ppl 80.50\n",
"| epoch 12 | 2400/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.39 | ppl 80.92\n",
"| epoch 12 | 2600/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.35 | ppl 77.30\n",
"| epoch 12 | 2800/ 3181 batches | lr 2.84 | ms/batch 61.39 | loss 4.42 | ppl 83.09\n",
"| epoch 12 | 3000/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.33 | ppl 75.78\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 12 | time: 206.67s | valid loss 5.42 | valid ppl 224.91\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 13 | 200/ 3181 batches | lr 2.70 | ms/batch 61.68 | loss 4.43 | ppl 83.99\n",
"| epoch 13 | 400/ 3181 batches | lr 2.70 | ms/batch 61.34 | loss 4.36 | ppl 78.48\n",
"| epoch 13 | 600/ 3181 batches | lr 2.70 | ms/batch 61.33 | loss 4.35 | ppl 77.76\n",
"| epoch 13 | 800/ 3181 batches | lr 2.70 | ms/batch 61.31 | loss 4.37 | ppl 78.88\n",
"| epoch 13 | 1000/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.39 | ppl 80.64\n",
"| epoch 13 | 1200/ 3181 batches | lr 2.70 | ms/batch 61.37 | loss 4.34 | ppl 76.95\n",
"| epoch 13 | 1400/ 3181 batches | lr 2.70 | ms/batch 61.41 | loss 4.36 | ppl 78.49\n",
"| epoch 13 | 1600/ 3181 batches | lr 2.70 | ms/batch 61.35 | loss 4.36 | ppl 77.93\n",
"| epoch 13 | 1800/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.37 | ppl 79.08\n",
"| epoch 13 | 2000/ 3181 batches | lr 2.70 | ms/batch 61.34 | loss 4.34 | ppl 76.68\n",
"| epoch 13 | 2200/ 3181 batches | lr 2.70 | ms/batch 61.37 | loss 4.32 | ppl 75.17\n",
"| epoch 13 | 2400/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.33 | ppl 75.87\n",
"| epoch 13 | 2600/ 3181 batches | lr 2.70 | ms/batch 61.29 | loss 4.28 | ppl 72.20\n",
"| epoch 13 | 2800/ 3181 batches | lr 2.70 | ms/batch 61.27 | loss 4.36 | ppl 78.01\n",
"| epoch 13 | 3000/ 3181 batches | lr 2.70 | ms/batch 61.36 | loss 4.26 | ppl 70.91\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 13 | time: 206.58s | valid loss 5.42 | valid ppl 225.31\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 14 | 200/ 3181 batches | lr 2.57 | ms/batch 61.59 | loss 4.37 | ppl 79.39\n",
"| epoch 14 | 400/ 3181 batches | lr 2.57 | ms/batch 61.37 | loss 4.30 | ppl 73.94\n",
"| epoch 14 | 600/ 3181 batches | lr 2.57 | ms/batch 61.32 | loss 4.30 | ppl 73.50\n",
"| epoch 14 | 800/ 3181 batches | lr 2.57 | ms/batch 61.36 | loss 4.31 | ppl 74.12\n",
"| epoch 14 | 1000/ 3181 batches | lr 2.57 | ms/batch 61.41 | loss 4.33 | ppl 75.86\n",
"| epoch 14 | 1200/ 3181 batches | lr 2.57 | ms/batch 61.34 | loss 4.29 | ppl 72.64\n",
"| epoch 14 | 1400/ 3181 batches | lr 2.57 | ms/batch 61.39 | loss 4.31 | ppl 74.29\n",
"| epoch 14 | 1600/ 3181 batches | lr 2.57 | ms/batch 61.31 | loss 4.29 | ppl 73.17\n",
"| epoch 14 | 1800/ 3181 batches | lr 2.57 | ms/batch 61.41 | loss 4.31 | ppl 74.28\n",
"| epoch 14 | 2000/ 3181 batches | lr 2.57 | ms/batch 61.34 | loss 4.28 | ppl 71.97\n",
"| epoch 14 | 2200/ 3181 batches | lr 2.57 | ms/batch 61.44 | loss 4.26 | ppl 71.13\n",
"| epoch 14 | 2400/ 3181 batches | lr 2.57 | ms/batch 61.32 | loss 4.27 | ppl 71.61\n",
"| epoch 14 | 2600/ 3181 batches | lr 2.57 | ms/batch 61.42 | loss 4.22 | ppl 67.93\n",
"| epoch 14 | 2800/ 3181 batches | lr 2.57 | ms/batch 61.42 | loss 4.30 | ppl 73.68\n",
"| epoch 14 | 3000/ 3181 batches | lr 2.57 | ms/batch 61.36 | loss 4.21 | ppl 67.08\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 14 | time: 206.63s | valid loss 5.47 | valid ppl 236.36\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 15 | 200/ 3181 batches | lr 2.44 | ms/batch 61.66 | loss 4.32 | ppl 75.20\n",
"| epoch 15 | 400/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.25 | ppl 69.78\n",
"| epoch 15 | 600/ 3181 batches | lr 2.44 | ms/batch 61.30 | loss 4.23 | ppl 68.98\n",
"| epoch 15 | 800/ 3181 batches | lr 2.44 | ms/batch 61.34 | loss 4.25 | ppl 70.20\n",
"| epoch 15 | 1000/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.28 | ppl 71.96\n",
"| epoch 15 | 1200/ 3181 batches | lr 2.44 | ms/batch 61.29 | loss 4.23 | ppl 68.62\n",
"| epoch 15 | 1400/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.25 | ppl 70.18\n",
"| epoch 15 | 1600/ 3181 batches | lr 2.44 | ms/batch 61.37 | loss 4.23 | ppl 68.99\n",
"| epoch 15 | 1800/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.25 | ppl 69.87\n",
"| epoch 15 | 2000/ 3181 batches | lr 2.44 | ms/batch 61.36 | loss 4.22 | ppl 67.79\n",
"| epoch 15 | 2200/ 3181 batches | lr 2.44 | ms/batch 61.40 | loss 4.21 | ppl 67.21\n",
"| epoch 15 | 2400/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.21 | ppl 67.61\n",
"| epoch 15 | 2600/ 3181 batches | lr 2.44 | ms/batch 61.40 | loss 4.15 | ppl 63.73\n",
"| epoch 15 | 2800/ 3181 batches | lr 2.44 | ms/batch 61.37 | loss 4.24 | ppl 69.43\n",
"| epoch 15 | 3000/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.15 | ppl 63.16\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 15 | time: 206.62s | valid loss 5.47 | valid ppl 238.57\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 16 | 200/ 3181 batches | lr 2.32 | ms/batch 61.60 | loss 4.26 | ppl 71.14\n",
"| epoch 16 | 400/ 3181 batches | lr 2.32 | ms/batch 61.33 | loss 4.19 | ppl 65.93\n",
"| epoch 16 | 600/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.18 | ppl 65.22\n",
"| epoch 16 | 800/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.19 | ppl 66.07\n",
"| epoch 16 | 1000/ 3181 batches | lr 2.32 | ms/batch 61.41 | loss 4.22 | ppl 68.20\n",
"| epoch 16 | 1200/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.17 | ppl 65.03\n",
"| epoch 16 | 1400/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.20 | ppl 66.72\n",
"| epoch 16 | 1600/ 3181 batches | lr 2.32 | ms/batch 61.38 | loss 4.19 | ppl 65.70\n",
"| epoch 16 | 1800/ 3181 batches | lr 2.32 | ms/batch 61.37 | loss 4.19 | ppl 66.27\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 16 | 2000/ 3181 batches | lr 2.32 | ms/batch 61.40 | loss 4.17 | ppl 64.69\n",
"| epoch 16 | 2200/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.15 | ppl 63.70\n",
"| epoch 16 | 2400/ 3181 batches | lr 2.32 | ms/batch 61.36 | loss 4.17 | ppl 64.52\n",
"| epoch 16 | 2600/ 3181 batches | lr 2.32 | ms/batch 61.41 | loss 4.11 | ppl 60.85\n",
"| epoch 16 | 2800/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.19 | ppl 66.21\n",
"| epoch 16 | 3000/ 3181 batches | lr 2.32 | ms/batch 61.39 | loss 4.09 | ppl 59.76\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 16 | time: 206.63s | valid loss 5.50 | valid ppl 243.52\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 17 | 200/ 3181 batches | lr 2.20 | ms/batch 61.74 | loss 4.21 | ppl 67.46\n",
"| epoch 17 | 400/ 3181 batches | lr 2.20 | ms/batch 61.43 | loss 4.14 | ppl 62.91\n",
"| epoch 17 | 600/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.13 | ppl 61.89\n",
"| epoch 17 | 800/ 3181 batches | lr 2.20 | ms/batch 61.44 | loss 4.15 | ppl 63.38\n",
"| epoch 17 | 1000/ 3181 batches | lr 2.20 | ms/batch 61.34 | loss 4.17 | ppl 64.88\n",
"| epoch 17 | 1200/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.13 | ppl 62.19\n",
"| epoch 17 | 1400/ 3181 batches | lr 2.20 | ms/batch 61.38 | loss 4.15 | ppl 63.41\n",
"| epoch 17 | 1600/ 3181 batches | lr 2.20 | ms/batch 61.37 | loss 4.13 | ppl 62.14\n",
"| epoch 17 | 1800/ 3181 batches | lr 2.20 | ms/batch 61.40 | loss 4.15 | ppl 63.28\n",
"| epoch 17 | 2000/ 3181 batches | lr 2.20 | ms/batch 61.34 | loss 4.12 | ppl 61.53\n",
"| epoch 17 | 2200/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.10 | ppl 60.52\n",
"| epoch 17 | 2400/ 3181 batches | lr 2.20 | ms/batch 61.45 | loss 4.11 | ppl 61.21\n",
"| epoch 17 | 2600/ 3181 batches | lr 2.20 | ms/batch 61.33 | loss 4.06 | ppl 58.01\n",
"| epoch 17 | 2800/ 3181 batches | lr 2.20 | ms/batch 61.36 | loss 4.14 | ppl 62.99\n",
"| epoch 17 | 3000/ 3181 batches | lr 2.20 | ms/batch 61.36 | loss 4.04 | ppl 56.98\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 17 | time: 206.66s | valid loss 5.51 | valid ppl 245.93\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 18 | 200/ 3181 batches | lr 2.09 | ms/batch 61.64 | loss 4.16 | ppl 64.27\n",
"| epoch 18 | 400/ 3181 batches | lr 2.09 | ms/batch 61.43 | loss 4.09 | ppl 59.95\n",
"| epoch 18 | 600/ 3181 batches | lr 2.09 | ms/batch 61.38 | loss 4.08 | ppl 58.99\n",
"| epoch 18 | 800/ 3181 batches | lr 2.09 | ms/batch 61.31 | loss 4.10 | ppl 60.18\n",
"| epoch 18 | 1000/ 3181 batches | lr 2.09 | ms/batch 61.37 | loss 4.12 | ppl 61.79\n",
"| epoch 18 | 1200/ 3181 batches | lr 2.09 | ms/batch 61.42 | loss 4.08 | ppl 58.92\n",
"| epoch 18 | 1400/ 3181 batches | lr 2.09 | ms/batch 61.36 | loss 4.10 | ppl 60.40\n",
"| epoch 18 | 1600/ 3181 batches | lr 2.09 | ms/batch 61.43 | loss 4.08 | ppl 59.34\n",
"| epoch 18 | 1800/ 3181 batches | lr 2.09 | ms/batch 61.36 | loss 4.09 | ppl 59.74\n",
"| epoch 18 | 2000/ 3181 batches | lr 2.09 | ms/batch 61.39 | loss 4.07 | ppl 58.43\n",
"| epoch 18 | 2200/ 3181 batches | lr 2.09 | ms/batch 61.32 | loss 4.06 | ppl 58.17\n",
"| epoch 18 | 2400/ 3181 batches | lr 2.09 | ms/batch 61.32 | loss 4.07 | ppl 58.27\n",
"| epoch 18 | 2600/ 3181 batches | lr 2.09 | ms/batch 61.35 | loss 4.01 | ppl 55.01\n",
"| epoch 18 | 2800/ 3181 batches | lr 2.09 | ms/batch 61.40 | loss 4.09 | ppl 59.91\n",
"| epoch 18 | 3000/ 3181 batches | lr 2.09 | ms/batch 61.34 | loss 4.00 | ppl 54.82\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 18 | time: 206.65s | valid loss 5.52 | valid ppl 248.66\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 19 | 200/ 3181 batches | lr 1.99 | ms/batch 61.63 | loss 4.12 | ppl 61.26\n",
"| epoch 19 | 400/ 3181 batches | lr 1.99 | ms/batch 61.36 | loss 4.04 | ppl 57.10\n",
"| epoch 19 | 600/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.03 | ppl 56.18\n",
"| epoch 19 | 800/ 3181 batches | lr 1.99 | ms/batch 61.36 | loss 4.06 | ppl 57.74\n",
"| epoch 19 | 1000/ 3181 batches | lr 1.99 | ms/batch 61.49 | loss 4.08 | ppl 59.10\n",
"| epoch 19 | 1200/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.03 | ppl 56.27\n",
"| epoch 19 | 1400/ 3181 batches | lr 1.99 | ms/batch 61.34 | loss 4.06 | ppl 57.99\n",
"| epoch 19 | 1600/ 3181 batches | lr 1.99 | ms/batch 61.40 | loss 4.04 | ppl 56.78\n",
"| epoch 19 | 1800/ 3181 batches | lr 1.99 | ms/batch 61.39 | loss 4.05 | ppl 57.32\n",
"| epoch 19 | 2000/ 3181 batches | lr 1.99 | ms/batch 61.43 | loss 4.03 | ppl 56.16\n",
"| epoch 19 | 2200/ 3181 batches | lr 1.99 | ms/batch 61.34 | loss 4.02 | ppl 55.62\n",
"| epoch 19 | 2400/ 3181 batches | lr 1.99 | ms/batch 61.42 | loss 4.02 | ppl 55.68\n",
"| epoch 19 | 2600/ 3181 batches | lr 1.99 | ms/batch 61.38 | loss 3.97 | ppl 52.86\n",
"| epoch 19 | 2800/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.05 | ppl 57.12\n",
"| epoch 19 | 3000/ 3181 batches | lr 1.99 | ms/batch 61.31 | loss 3.95 | ppl 52.08\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 19 | time: 206.62s | valid loss 5.55 | valid ppl 257.12\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 20 | 200/ 3181 batches | lr 1.89 | ms/batch 61.70 | loss 4.07 | ppl 58.59\n",
"| epoch 20 | 400/ 3181 batches | lr 1.89 | ms/batch 61.38 | loss 4.01 | ppl 55.07\n",
"| epoch 20 | 600/ 3181 batches | lr 1.89 | ms/batch 61.40 | loss 3.99 | ppl 53.82\n",
"| epoch 20 | 800/ 3181 batches | lr 1.89 | ms/batch 61.40 | loss 4.01 | ppl 55.29\n",
"| epoch 20 | 1000/ 3181 batches | lr 1.89 | ms/batch 61.35 | loss 4.04 | ppl 56.83\n",
"| epoch 20 | 1200/ 3181 batches | lr 1.89 | ms/batch 61.34 | loss 3.99 | ppl 54.01\n",
"| epoch 20 | 1400/ 3181 batches | lr 1.89 | ms/batch 61.35 | loss 4.02 | ppl 55.48\n",
"| epoch 20 | 1600/ 3181 batches | lr 1.89 | ms/batch 61.33 | loss 4.00 | ppl 54.51\n",
"| epoch 20 | 1800/ 3181 batches | lr 1.89 | ms/batch 61.41 | loss 4.01 | ppl 55.02\n",
"| epoch 20 | 2000/ 3181 batches | lr 1.89 | ms/batch 61.38 | loss 3.99 | ppl 54.00\n",
"| epoch 20 | 2200/ 3181 batches | lr 1.89 | ms/batch 61.39 | loss 3.97 | ppl 53.23\n",
"| epoch 20 | 2400/ 3181 batches | lr 1.89 | ms/batch 61.29 | loss 3.98 | ppl 53.61\n",
"| epoch 20 | 2600/ 3181 batches | lr 1.89 | ms/batch 61.30 | loss 3.92 | ppl 50.62\n",
"| epoch 20 | 2800/ 3181 batches | lr 1.89 | ms/batch 61.32 | loss 4.01 | ppl 55.04\n",
"| epoch 20 | 3000/ 3181 batches | lr 1.89 | ms/batch 61.39 | loss 3.92 | ppl 50.18\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 20 | time: 206.60s | valid loss 5.61 | valid ppl 273.93\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 21 | 200/ 3181 batches | lr 1.79 | ms/batch 61.65 | loss 4.03 | ppl 56.37\n",
"| epoch 21 | 400/ 3181 batches | lr 1.79 | ms/batch 61.42 | loss 3.96 | ppl 52.65\n",
"| epoch 21 | 600/ 3181 batches | lr 1.79 | ms/batch 61.43 | loss 3.94 | ppl 51.53\n",
"| epoch 21 | 800/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.97 | ppl 52.82\n",
"| epoch 21 | 1000/ 3181 batches | lr 1.79 | ms/batch 61.34 | loss 3.99 | ppl 54.28\n",
"| epoch 21 | 1200/ 3181 batches | lr 1.79 | ms/batch 61.31 | loss 3.95 | ppl 51.85\n",
"| epoch 21 | 1400/ 3181 batches | lr 1.79 | ms/batch 61.33 | loss 3.98 | ppl 53.51\n",
"| epoch 21 | 1600/ 3181 batches | lr 1.79 | ms/batch 61.37 | loss 3.96 | ppl 52.23\n",
"| epoch 21 | 1800/ 3181 batches | lr 1.79 | ms/batch 61.42 | loss 3.97 | ppl 52.95\n",
"| epoch 21 | 2000/ 3181 batches | lr 1.79 | ms/batch 61.38 | loss 3.95 | ppl 51.71\n",
"| epoch 21 | 2200/ 3181 batches | lr 1.79 | ms/batch 61.38 | loss 3.94 | ppl 51.19\n",
"| epoch 21 | 2400/ 3181 batches | lr 1.79 | ms/batch 61.34 | loss 3.94 | ppl 51.57\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 21 | 2600/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.88 | ppl 48.60\n",
"| epoch 21 | 2800/ 3181 batches | lr 1.79 | ms/batch 61.40 | loss 3.97 | ppl 52.99\n",
"| epoch 21 | 3000/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.87 | ppl 48.17\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 21 | time: 206.61s | valid loss 5.61 | valid ppl 273.11\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 22 | 200/ 3181 batches | lr 1.70 | ms/batch 61.70 | loss 3.99 | ppl 54.02\n",
"| epoch 22 | 400/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.92 | ppl 50.52\n",
"| epoch 22 | 600/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.90 | ppl 49.61\n",
"| epoch 22 | 800/ 3181 batches | lr 1.70 | ms/batch 61.33 | loss 3.93 | ppl 51.15\n",
"| epoch 22 | 1000/ 3181 batches | lr 1.70 | ms/batch 61.34 | loss 3.96 | ppl 52.34\n",
"| epoch 22 | 1200/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.91 | ppl 50.10\n",
"| epoch 22 | 1400/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.94 | ppl 51.37\n",
"| epoch 22 | 1600/ 3181 batches | lr 1.70 | ms/batch 61.37 | loss 3.92 | ppl 50.25\n",
"| epoch 22 | 1800/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.93 | ppl 50.89\n",
"| epoch 22 | 2000/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.91 | ppl 49.70\n",
"| epoch 22 | 2200/ 3181 batches | lr 1.70 | ms/batch 61.43 | loss 3.90 | ppl 49.28\n",
"| epoch 22 | 2400/ 3181 batches | lr 1.70 | ms/batch 61.37 | loss 3.90 | ppl 49.46\n",
"| epoch 22 | 2600/ 3181 batches | lr 1.70 | ms/batch 61.41 | loss 3.84 | ppl 46.62\n",
"| epoch 22 | 2800/ 3181 batches | lr 1.70 | ms/batch 61.38 | loss 3.93 | ppl 50.75\n",
"| epoch 22 | 3000/ 3181 batches | lr 1.70 | ms/batch 61.34 | loss 3.83 | ppl 46.27\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 22 | time: 206.60s | valid loss 5.61 | valid ppl 273.57\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 23 | 200/ 3181 batches | lr 1.62 | ms/batch 61.61 | loss 3.96 | ppl 52.31\n",
"| epoch 23 | 400/ 3181 batches | lr 1.62 | ms/batch 61.32 | loss 3.88 | ppl 48.56\n",
"| epoch 23 | 600/ 3181 batches | lr 1.62 | ms/batch 61.35 | loss 3.86 | ppl 47.70\n",
"| epoch 23 | 800/ 3181 batches | lr 1.62 | ms/batch 61.31 | loss 3.90 | ppl 49.41\n",
"| epoch 23 | 1000/ 3181 batches | lr 1.62 | ms/batch 61.41 | loss 3.92 | ppl 50.42\n",
"| epoch 23 | 1200/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.88 | ppl 48.43\n",
"| epoch 23 | 1400/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.91 | ppl 49.85\n",
"| epoch 23 | 1600/ 3181 batches | lr 1.62 | ms/batch 61.30 | loss 3.88 | ppl 48.37\n",
"| epoch 23 | 1800/ 3181 batches | lr 1.62 | ms/batch 61.34 | loss 3.89 | ppl 49.03\n",
"| epoch 23 | 2000/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.87 | ppl 48.12\n",
"| epoch 23 | 2200/ 3181 batches | lr 1.62 | ms/batch 61.36 | loss 3.86 | ppl 47.57\n",
"| epoch 23 | 2400/ 3181 batches | lr 1.62 | ms/batch 61.38 | loss 3.87 | ppl 47.73\n",
"| epoch 23 | 2600/ 3181 batches | lr 1.62 | ms/batch 61.29 | loss 3.81 | ppl 45.15\n",
"| epoch 23 | 2800/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.90 | ppl 49.58\n",
"| epoch 23 | 3000/ 3181 batches | lr 1.62 | ms/batch 61.38 | loss 3.80 | ppl 44.75\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 23 | time: 206.56s | valid loss 5.64 | valid ppl 281.95\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 24 | 200/ 3181 batches | lr 1.54 | ms/batch 61.67 | loss 3.92 | ppl 50.35\n",
"| epoch 24 | 400/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.85 | ppl 47.01\n",
"| epoch 24 | 600/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.84 | ppl 46.34\n",
"| epoch 24 | 800/ 3181 batches | lr 1.54 | ms/batch 61.43 | loss 3.87 | ppl 47.90\n",
"| epoch 24 | 1000/ 3181 batches | lr 1.54 | ms/batch 61.47 | loss 3.89 | ppl 48.81\n",
"| epoch 24 | 1200/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.85 | ppl 46.83\n",
"| epoch 24 | 1400/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.87 | ppl 48.14\n",
"| epoch 24 | 1600/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.85 | ppl 46.96\n",
"| epoch 24 | 1800/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.86 | ppl 47.49\n",
"| epoch 24 | 2000/ 3181 batches | lr 1.54 | ms/batch 61.47 | loss 3.84 | ppl 46.41\n",
"| epoch 24 | 2200/ 3181 batches | lr 1.54 | ms/batch 61.31 | loss 3.82 | ppl 45.83\n",
"| epoch 24 | 2400/ 3181 batches | lr 1.54 | ms/batch 61.35 | loss 3.83 | ppl 46.13\n",
"| epoch 24 | 2600/ 3181 batches | lr 1.54 | ms/batch 61.36 | loss 3.77 | ppl 43.56\n",
"| epoch 24 | 2800/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.86 | ppl 47.52\n",
"| epoch 24 | 3000/ 3181 batches | lr 1.54 | ms/batch 61.29 | loss 3.77 | ppl 43.23\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 24 | time: 206.67s | valid loss 5.67 | valid ppl 290.25\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 25 | 200/ 3181 batches | lr 1.46 | ms/batch 61.66 | loss 3.89 | ppl 48.76\n",
"| epoch 25 | 400/ 3181 batches | lr 1.46 | ms/batch 61.41 | loss 3.82 | ppl 45.61\n",
"| epoch 25 | 600/ 3181 batches | lr 1.46 | ms/batch 61.44 | loss 3.80 | ppl 44.79\n",
"| epoch 25 | 800/ 3181 batches | lr 1.46 | ms/batch 61.35 | loss 3.83 | ppl 46.26\n",
"| epoch 25 | 1000/ 3181 batches | lr 1.46 | ms/batch 61.36 | loss 3.86 | ppl 47.26\n",
"| epoch 25 | 1200/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.81 | ppl 45.19\n",
"| epoch 25 | 1400/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.84 | ppl 46.37\n",
"| epoch 25 | 1600/ 3181 batches | lr 1.46 | ms/batch 61.36 | loss 3.82 | ppl 45.47\n",
"| epoch 25 | 1800/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.83 | ppl 45.88\n",
"| epoch 25 | 2000/ 3181 batches | lr 1.46 | ms/batch 61.35 | loss 3.81 | ppl 45.08\n",
"| epoch 25 | 2200/ 3181 batches | lr 1.46 | ms/batch 61.43 | loss 3.80 | ppl 44.56\n",
"| epoch 25 | 2400/ 3181 batches | lr 1.46 | ms/batch 61.37 | loss 3.80 | ppl 44.78\n",
"| epoch 25 | 2600/ 3181 batches | lr 1.46 | ms/batch 61.34 | loss 3.74 | ppl 42.12\n",
"| epoch 25 | 2800/ 3181 batches | lr 1.46 | ms/batch 61.31 | loss 3.83 | ppl 45.90\n",
"| epoch 25 | 3000/ 3181 batches | lr 1.46 | ms/batch 61.37 | loss 3.74 | ppl 42.12\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 25 | time: 206.63s | valid loss 5.65 | valid ppl 283.82\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 26 | 200/ 3181 batches | lr 1.39 | ms/batch 61.69 | loss 3.86 | ppl 47.46\n",
"| epoch 26 | 400/ 3181 batches | lr 1.39 | ms/batch 61.44 | loss 3.79 | ppl 44.15\n",
"| epoch 26 | 600/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.77 | ppl 43.51\n",
"| epoch 26 | 800/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.81 | ppl 45.08\n",
"| epoch 26 | 1000/ 3181 batches | lr 1.39 | ms/batch 61.42 | loss 3.82 | ppl 45.75\n",
"| epoch 26 | 1200/ 3181 batches | lr 1.39 | ms/batch 61.40 | loss 3.78 | ppl 43.98\n",
"| epoch 26 | 1400/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.81 | ppl 45.28\n",
"| epoch 26 | 1600/ 3181 batches | lr 1.39 | ms/batch 61.28 | loss 3.78 | ppl 43.92\n",
"| epoch 26 | 1800/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.80 | ppl 44.57\n",
"| epoch 26 | 2000/ 3181 batches | lr 1.39 | ms/batch 61.38 | loss 3.77 | ppl 43.55\n",
"| epoch 26 | 2200/ 3181 batches | lr 1.39 | ms/batch 61.44 | loss 3.77 | ppl 43.27\n",
"| epoch 26 | 2400/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.77 | ppl 43.43\n",
"| epoch 26 | 2600/ 3181 batches | lr 1.39 | ms/batch 61.41 | loss 3.71 | ppl 40.92\n",
"| epoch 26 | 2800/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.80 | ppl 44.73\n",
"| epoch 26 | 3000/ 3181 batches | lr 1.39 | ms/batch 61.40 | loss 3.71 | ppl 40.74\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 26 | time: 206.67s | valid loss 5.69 | valid ppl 294.72\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 27 | 200/ 3181 batches | lr 1.32 | ms/batch 61.61 | loss 3.83 | ppl 46.08\n",
"| epoch 27 | 400/ 3181 batches | lr 1.32 | ms/batch 61.34 | loss 3.76 | ppl 42.90\n",
"| epoch 27 | 600/ 3181 batches | lr 1.32 | ms/batch 61.37 | loss 3.73 | ppl 41.77\n",
"| epoch 27 | 800/ 3181 batches | lr 1.32 | ms/batch 61.39 | loss 3.78 | ppl 43.61\n",
"| epoch 27 | 1000/ 3181 batches | lr 1.32 | ms/batch 61.38 | loss 3.80 | ppl 44.57\n",
"| epoch 27 | 1200/ 3181 batches | lr 1.32 | ms/batch 61.31 | loss 3.75 | ppl 42.51\n",
"| epoch 27 | 1400/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.79 | ppl 44.18\n",
"| epoch 27 | 1600/ 3181 batches | lr 1.32 | ms/batch 61.30 | loss 3.76 | ppl 42.82\n",
"| epoch 27 | 1800/ 3181 batches | lr 1.32 | ms/batch 61.41 | loss 3.76 | ppl 42.95\n",
"| epoch 27 | 2000/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.75 | ppl 42.42\n",
"| epoch 27 | 2200/ 3181 batches | lr 1.32 | ms/batch 61.35 | loss 3.74 | ppl 42.12\n",
"| epoch 27 | 2400/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.74 | ppl 42.31\n",
"| epoch 27 | 2600/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.68 | ppl 39.83\n",
"| epoch 27 | 2800/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.77 | ppl 43.28\n",
"| epoch 27 | 3000/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.68 | ppl 39.55\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 27 | time: 206.56s | valid loss 5.75 | valid ppl 315.59\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 28 | 200/ 3181 batches | lr 1.25 | ms/batch 61.70 | loss 3.80 | ppl 44.70\n",
"| epoch 28 | 400/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.73 | ppl 41.81\n",
"| epoch 28 | 600/ 3181 batches | lr 1.25 | ms/batch 61.43 | loss 3.71 | ppl 40.76\n",
"| epoch 28 | 800/ 3181 batches | lr 1.25 | ms/batch 61.34 | loss 3.75 | ppl 42.56\n",
"| epoch 28 | 1000/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.77 | ppl 43.35\n",
"| epoch 28 | 1200/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.72 | ppl 41.32\n",
"| epoch 28 | 1400/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.75 | ppl 42.65\n",
"| epoch 28 | 1600/ 3181 batches | lr 1.25 | ms/batch 61.34 | loss 3.73 | ppl 41.67\n",
"| epoch 28 | 1800/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.73 | ppl 41.85\n",
"| epoch 28 | 2000/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.72 | ppl 41.24\n",
"| epoch 28 | 2200/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.71 | ppl 40.83\n",
"| epoch 28 | 2400/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.72 | ppl 41.20\n",
"| epoch 28 | 2600/ 3181 batches | lr 1.25 | ms/batch 61.39 | loss 3.66 | ppl 38.72\n",
"| epoch 28 | 2800/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.74 | ppl 42.13\n",
"| epoch 28 | 3000/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.65 | ppl 38.37\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 28 | time: 206.71s | valid loss 5.77 | valid ppl 320.59\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 29 | 200/ 3181 batches | lr 1.19 | ms/batch 61.73 | loss 3.77 | ppl 43.52\n",
"| epoch 29 | 400/ 3181 batches | lr 1.19 | ms/batch 61.39 | loss 3.70 | ppl 40.52\n",
"| epoch 29 | 600/ 3181 batches | lr 1.19 | ms/batch 61.39 | loss 3.68 | ppl 39.58\n",
"| epoch 29 | 800/ 3181 batches | lr 1.19 | ms/batch 61.48 | loss 3.72 | ppl 41.41\n",
"| epoch 29 | 1000/ 3181 batches | lr 1.19 | ms/batch 61.35 | loss 3.74 | ppl 42.29\n",
"| epoch 29 | 1200/ 3181 batches | lr 1.19 | ms/batch 61.36 | loss 3.70 | ppl 40.36\n",
"| epoch 29 | 1400/ 3181 batches | lr 1.19 | ms/batch 61.37 | loss 3.73 | ppl 41.64\n",
"| epoch 29 | 1600/ 3181 batches | lr 1.19 | ms/batch 61.40 | loss 3.71 | ppl 40.66\n",
"| epoch 29 | 1800/ 3181 batches | lr 1.19 | ms/batch 61.44 | loss 3.72 | ppl 41.08\n",
"| epoch 29 | 2000/ 3181 batches | lr 1.19 | ms/batch 61.44 | loss 3.69 | ppl 40.20\n",
"| epoch 29 | 2200/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.68 | ppl 39.80\n",
"| epoch 29 | 2400/ 3181 batches | lr 1.19 | ms/batch 61.45 | loss 3.70 | ppl 40.25\n",
"| epoch 29 | 2600/ 3181 batches | lr 1.19 | ms/batch 61.47 | loss 3.63 | ppl 37.79\n",
"| epoch 29 | 2800/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.72 | ppl 41.21\n",
"| epoch 29 | 3000/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.62 | ppl 37.43\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 29 | time: 206.79s | valid loss 5.81 | valid ppl 332.16\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 30 | 200/ 3181 batches | lr 1.13 | ms/batch 61.74 | loss 3.74 | ppl 42.22\n",
"| epoch 30 | 400/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.68 | ppl 39.52\n",
"| epoch 30 | 600/ 3181 batches | lr 1.13 | ms/batch 61.41 | loss 3.65 | ppl 38.62\n",
"| epoch 30 | 800/ 3181 batches | lr 1.13 | ms/batch 61.39 | loss 3.70 | ppl 40.47\n",
"| epoch 30 | 1000/ 3181 batches | lr 1.13 | ms/batch 61.50 | loss 3.72 | ppl 41.14\n",
"| epoch 30 | 1200/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.67 | ppl 39.41\n",
"| epoch 30 | 1400/ 3181 batches | lr 1.13 | ms/batch 61.43 | loss 3.71 | ppl 40.66\n",
"| epoch 30 | 1600/ 3181 batches | lr 1.13 | ms/batch 61.40 | loss 3.68 | ppl 39.62\n",
"| epoch 30 | 1800/ 3181 batches | lr 1.13 | ms/batch 61.38 | loss 3.69 | ppl 39.97\n",
"| epoch 30 | 2000/ 3181 batches | lr 1.13 | ms/batch 61.36 | loss 3.67 | ppl 39.34\n",
"| epoch 30 | 2200/ 3181 batches | lr 1.13 | ms/batch 61.43 | loss 3.66 | ppl 38.99\n",
"| epoch 30 | 2400/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.66 | ppl 39.01\n",
"| epoch 30 | 2600/ 3181 batches | lr 1.13 | ms/batch 61.40 | loss 3.61 | ppl 36.84\n",
"| epoch 30 | 2800/ 3181 batches | lr 1.13 | ms/batch 61.50 | loss 3.69 | ppl 40.20\n",
"| epoch 30 | 3000/ 3181 batches | lr 1.13 | ms/batch 61.38 | loss 3.60 | ppl 36.54\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 30 | time: 206.80s | valid loss 5.75 | valid ppl 313.98\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 31 | 200/ 3181 batches | lr 1.07 | ms/batch 61.74 | loss 3.72 | ppl 41.43\n",
"| epoch 31 | 400/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.65 | ppl 38.65\n",
"| epoch 31 | 600/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.63 | ppl 37.82\n",
"| epoch 31 | 800/ 3181 batches | lr 1.07 | ms/batch 61.40 | loss 3.68 | ppl 39.51\n",
"| epoch 31 | 1000/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.69 | ppl 40.17\n",
"| epoch 31 | 1200/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.65 | ppl 38.53\n",
"| epoch 31 | 1400/ 3181 batches | lr 1.07 | ms/batch 61.36 | loss 3.69 | ppl 39.93\n",
"| epoch 31 | 1600/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.66 | ppl 38.77\n",
"| epoch 31 | 1800/ 3181 batches | lr 1.07 | ms/batch 61.39 | loss 3.67 | ppl 39.17\n",
"| epoch 31 | 2000/ 3181 batches | lr 1.07 | ms/batch 61.49 | loss 3.65 | ppl 38.48\n",
"| epoch 31 | 2200/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.63 | ppl 37.78\n",
"| epoch 31 | 2400/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.65 | ppl 38.35\n",
"| epoch 31 | 2600/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.59 | ppl 36.09\n",
"| epoch 31 | 2800/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.67 | ppl 39.29\n",
"| epoch 31 | 3000/ 3181 batches | lr 1.07 | ms/batch 61.36 | loss 3.57 | ppl 35.60\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 31 | time: 206.68s | valid loss 5.82 | valid ppl 335.54\n",
"-----------------------------------------------------------------------------------------\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 32 | 200/ 3181 batches | lr 1.02 | ms/batch 61.77 | loss 3.70 | ppl 40.52\n",
"| epoch 32 | 400/ 3181 batches | lr 1.02 | ms/batch 61.35 | loss 3.64 | ppl 37.96\n",
"| epoch 32 | 600/ 3181 batches | lr 1.02 | ms/batch 61.39 | loss 3.61 | ppl 36.88\n",
"| epoch 32 | 800/ 3181 batches | lr 1.02 | ms/batch 61.41 | loss 3.66 | ppl 38.80\n",
"| epoch 32 | 1000/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.67 | ppl 39.32\n",
"| epoch 32 | 1200/ 3181 batches | lr 1.02 | ms/batch 61.42 | loss 3.63 | ppl 37.59\n",
"| epoch 32 | 1400/ 3181 batches | lr 1.02 | ms/batch 61.46 | loss 3.66 | ppl 38.96\n",
"| epoch 32 | 1600/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.64 | ppl 38.12\n",
"| epoch 32 | 1800/ 3181 batches | lr 1.02 | ms/batch 61.46 | loss 3.64 | ppl 38.28\n",
"| epoch 32 | 2000/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.63 | ppl 37.70\n",
"| epoch 32 | 2200/ 3181 batches | lr 1.02 | ms/batch 61.37 | loss 3.61 | ppl 37.07\n",
"| epoch 32 | 2400/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.62 | ppl 37.39\n",
"| epoch 32 | 2600/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.56 | ppl 35.19\n",
"| epoch 32 | 2800/ 3181 batches | lr 1.02 | ms/batch 61.43 | loss 3.65 | ppl 38.29\n",
"| epoch 32 | 3000/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.55 | ppl 34.89\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 32 | time: 206.72s | valid loss 5.81 | valid ppl 333.52\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 33 | 200/ 3181 batches | lr 0.97 | ms/batch 61.68 | loss 3.68 | ppl 39.71\n",
"| epoch 33 | 400/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.61 | ppl 37.00\n",
"| epoch 33 | 600/ 3181 batches | lr 0.97 | ms/batch 61.36 | loss 3.59 | ppl 36.33\n",
"| epoch 33 | 800/ 3181 batches | lr 0.97 | ms/batch 61.35 | loss 3.64 | ppl 38.02\n",
"| epoch 33 | 1000/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.65 | ppl 38.54\n",
"| epoch 33 | 1200/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.61 | ppl 37.12\n",
"| epoch 33 | 1400/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.64 | ppl 38.27\n",
"| epoch 33 | 1600/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.62 | ppl 37.26\n",
"| epoch 33 | 1800/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.62 | ppl 37.45\n",
"| epoch 33 | 2000/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.61 | ppl 36.92\n",
"| epoch 33 | 2200/ 3181 batches | lr 0.97 | ms/batch 61.37 | loss 3.59 | ppl 36.34\n",
"| epoch 33 | 2400/ 3181 batches | lr 0.97 | ms/batch 61.41 | loss 3.60 | ppl 36.73\n",
"| epoch 33 | 2600/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.54 | ppl 34.54\n",
"| epoch 33 | 2800/ 3181 batches | lr 0.97 | ms/batch 61.39 | loss 3.62 | ppl 37.42\n",
"| epoch 33 | 3000/ 3181 batches | lr 0.97 | ms/batch 61.45 | loss 3.53 | ppl 34.28\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 33 | time: 206.79s | valid loss 5.84 | valid ppl 345.08\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 34 | 200/ 3181 batches | lr 0.92 | ms/batch 61.72 | loss 3.66 | ppl 38.95\n",
"| epoch 34 | 400/ 3181 batches | lr 0.92 | ms/batch 61.44 | loss 3.60 | ppl 36.53\n",
"| epoch 34 | 600/ 3181 batches | lr 0.92 | ms/batch 61.43 | loss 3.57 | ppl 35.49\n",
"| epoch 34 | 800/ 3181 batches | lr 0.92 | ms/batch 61.42 | loss 3.62 | ppl 37.33\n",
"| epoch 34 | 1000/ 3181 batches | lr 0.92 | ms/batch 61.39 | loss 3.63 | ppl 37.79\n",
"| epoch 34 | 1200/ 3181 batches | lr 0.92 | ms/batch 61.34 | loss 3.59 | ppl 36.16\n",
"| epoch 34 | 1400/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.63 | ppl 37.62\n",
"| epoch 34 | 1600/ 3181 batches | lr 0.92 | ms/batch 61.42 | loss 3.60 | ppl 36.58\n",
"| epoch 34 | 1800/ 3181 batches | lr 0.92 | ms/batch 61.37 | loss 3.60 | ppl 36.77\n",
"| epoch 34 | 2000/ 3181 batches | lr 0.92 | ms/batch 61.36 | loss 3.59 | ppl 36.25\n",
"| epoch 34 | 2200/ 3181 batches | lr 0.92 | ms/batch 61.43 | loss 3.58 | ppl 35.76\n",
"| epoch 34 | 2400/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.59 | ppl 36.19\n",
"| epoch 34 | 2600/ 3181 batches | lr 0.92 | ms/batch 61.40 | loss 3.52 | ppl 33.87\n",
"| epoch 34 | 2800/ 3181 batches | lr 0.92 | ms/batch 61.31 | loss 3.61 | ppl 36.90\n",
"| epoch 34 | 3000/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.52 | ppl 33.68\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 34 | time: 206.73s | valid loss 5.83 | valid ppl 341.59\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 35 | 200/ 3181 batches | lr 0.87 | ms/batch 61.62 | loss 3.64 | ppl 38.22\n",
"| epoch 35 | 400/ 3181 batches | lr 0.87 | ms/batch 61.42 | loss 3.58 | ppl 35.82\n",
"| epoch 35 | 600/ 3181 batches | lr 0.87 | ms/batch 61.36 | loss 3.55 | ppl 34.84\n",
"| epoch 35 | 800/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.61 | ppl 36.83\n",
"| epoch 35 | 1000/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.62 | ppl 37.16\n",
"| epoch 35 | 1200/ 3181 batches | lr 0.87 | ms/batch 61.44 | loss 3.57 | ppl 35.54\n",
"| epoch 35 | 1400/ 3181 batches | lr 0.87 | ms/batch 61.35 | loss 3.60 | ppl 36.70\n",
"| epoch 35 | 1600/ 3181 batches | lr 0.87 | ms/batch 61.44 | loss 3.58 | ppl 35.97\n",
"| epoch 35 | 1800/ 3181 batches | lr 0.87 | ms/batch 61.42 | loss 3.58 | ppl 35.94\n",
"| epoch 35 | 2000/ 3181 batches | lr 0.87 | ms/batch 61.48 | loss 3.57 | ppl 35.45\n",
"| epoch 35 | 2200/ 3181 batches | lr 0.87 | ms/batch 61.41 | loss 3.56 | ppl 35.07\n",
"| epoch 35 | 2400/ 3181 batches | lr 0.87 | ms/batch 61.37 | loss 3.57 | ppl 35.36\n",
"| epoch 35 | 2600/ 3181 batches | lr 0.87 | ms/batch 61.32 | loss 3.51 | ppl 33.39\n",
"| epoch 35 | 2800/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.59 | ppl 36.19\n",
"| epoch 35 | 3000/ 3181 batches | lr 0.87 | ms/batch 61.39 | loss 3.50 | ppl 33.10\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 35 | time: 206.71s | valid loss 5.84 | valid ppl 345.09\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 36 | 200/ 3181 batches | lr 0.83 | ms/batch 61.71 | loss 3.63 | ppl 37.53\n",
"| epoch 36 | 400/ 3181 batches | lr 0.83 | ms/batch 61.41 | loss 3.56 | ppl 35.03\n",
"| epoch 36 | 600/ 3181 batches | lr 0.83 | ms/batch 61.40 | loss 3.53 | ppl 34.18\n",
"| epoch 36 | 800/ 3181 batches | lr 0.83 | ms/batch 61.40 | loss 3.59 | ppl 36.15\n",
"| epoch 36 | 1000/ 3181 batches | lr 0.83 | ms/batch 61.36 | loss 3.60 | ppl 36.47\n",
"| epoch 36 | 1200/ 3181 batches | lr 0.83 | ms/batch 61.36 | loss 3.56 | ppl 35.02\n",
"| epoch 36 | 1400/ 3181 batches | lr 0.83 | ms/batch 61.41 | loss 3.59 | ppl 36.26\n",
"| epoch 36 | 1600/ 3181 batches | lr 0.83 | ms/batch 61.38 | loss 3.57 | ppl 35.41\n",
"| epoch 36 | 1800/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.57 | ppl 35.56\n",
"| epoch 36 | 2000/ 3181 batches | lr 0.83 | ms/batch 61.45 | loss 3.55 | ppl 34.90\n",
"| epoch 36 | 2200/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.54 | ppl 34.47\n",
"| epoch 36 | 2400/ 3181 batches | lr 0.83 | ms/batch 61.35 | loss 3.55 | ppl 34.76\n",
"| epoch 36 | 2600/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.49 | ppl 32.79\n",
"| epoch 36 | 2800/ 3181 batches | lr 0.83 | ms/batch 61.38 | loss 3.57 | ppl 35.60\n",
"| epoch 36 | 3000/ 3181 batches | lr 0.83 | ms/batch 61.44 | loss 3.48 | ppl 32.46\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 36 | time: 206.71s | valid loss 5.83 | valid ppl 339.42\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 37 | 200/ 3181 batches | lr 0.79 | ms/batch 61.75 | loss 3.61 | ppl 37.13\n",
"| epoch 37 | 400/ 3181 batches | lr 0.79 | ms/batch 61.40 | loss 3.54 | ppl 34.55\n",
"| epoch 37 | 600/ 3181 batches | lr 0.79 | ms/batch 61.37 | loss 3.51 | ppl 33.58\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 37 | 800/ 3181 batches | lr 0.79 | ms/batch 61.32 | loss 3.57 | ppl 35.62\n",
"| epoch 37 | 1000/ 3181 batches | lr 0.79 | ms/batch 61.30 | loss 3.58 | ppl 35.97\n",
"| epoch 37 | 1200/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.54 | ppl 34.53\n",
"| epoch 37 | 1400/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.58 | ppl 35.77\n",
"| epoch 37 | 1600/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.55 | ppl 34.77\n",
"| epoch 37 | 1800/ 3181 batches | lr 0.79 | ms/batch 61.40 | loss 3.55 | ppl 34.93\n",
"| epoch 37 | 2000/ 3181 batches | lr 0.79 | ms/batch 61.41 | loss 3.53 | ppl 34.29\n",
"| epoch 37 | 2200/ 3181 batches | lr 0.79 | ms/batch 61.45 | loss 3.53 | ppl 34.02\n",
"| epoch 37 | 2400/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.53 | ppl 34.29\n",
"| epoch 37 | 2600/ 3181 batches | lr 0.79 | ms/batch 61.37 | loss 3.48 | ppl 32.30\n",
"| epoch 37 | 2800/ 3181 batches | lr 0.79 | ms/batch 61.42 | loss 3.56 | ppl 35.19\n",
"| epoch 37 | 3000/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.47 | ppl 32.07\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 37 | time: 206.68s | valid loss 5.87 | valid ppl 352.86\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 38 | 200/ 3181 batches | lr 0.75 | ms/batch 61.67 | loss 3.59 | ppl 36.37\n",
"| epoch 38 | 400/ 3181 batches | lr 0.75 | ms/batch 61.50 | loss 3.53 | ppl 34.01\n",
"| epoch 38 | 600/ 3181 batches | lr 0.75 | ms/batch 61.39 | loss 3.50 | ppl 33.04\n",
"| epoch 38 | 800/ 3181 batches | lr 0.75 | ms/batch 61.38 | loss 3.56 | ppl 35.12\n",
"| epoch 38 | 1000/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.57 | ppl 35.51\n",
"| epoch 38 | 1200/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.52 | ppl 33.89\n",
"| epoch 38 | 1400/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.56 | ppl 35.27\n",
"| epoch 38 | 1600/ 3181 batches | lr 0.75 | ms/batch 61.46 | loss 3.54 | ppl 34.43\n",
"| epoch 38 | 1800/ 3181 batches | lr 0.75 | ms/batch 61.40 | loss 3.54 | ppl 34.47\n",
"| epoch 38 | 2000/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.52 | ppl 33.89\n",
"| epoch 38 | 2200/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.52 | ppl 33.72\n",
"| epoch 38 | 2400/ 3181 batches | lr 0.75 | ms/batch 61.41 | loss 3.52 | ppl 33.79\n",
"| epoch 38 | 2600/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.46 | ppl 31.89\n",
"| epoch 38 | 2800/ 3181 batches | lr 0.75 | ms/batch 61.40 | loss 3.54 | ppl 34.41\n",
"| epoch 38 | 3000/ 3181 batches | lr 0.75 | ms/batch 61.50 | loss 3.45 | ppl 31.48\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 38 | time: 206.80s | valid loss 5.88 | valid ppl 358.16\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 39 | 200/ 3181 batches | lr 0.71 | ms/batch 61.71 | loss 3.58 | ppl 36.05\n",
"| epoch 39 | 400/ 3181 batches | lr 0.71 | ms/batch 61.40 | loss 3.52 | ppl 33.71\n",
"| epoch 39 | 600/ 3181 batches | lr 0.71 | ms/batch 61.44 | loss 3.49 | ppl 32.73\n",
"| epoch 39 | 800/ 3181 batches | lr 0.71 | ms/batch 61.34 | loss 3.54 | ppl 34.55\n",
"| epoch 39 | 1000/ 3181 batches | lr 0.71 | ms/batch 61.37 | loss 3.56 | ppl 35.03\n",
"| epoch 39 | 1200/ 3181 batches | lr 0.71 | ms/batch 61.41 | loss 3.51 | ppl 33.38\n",
"| epoch 39 | 1400/ 3181 batches | lr 0.71 | ms/batch 61.43 | loss 3.55 | ppl 34.77\n",
"| epoch 39 | 1600/ 3181 batches | lr 0.71 | ms/batch 61.43 | loss 3.52 | ppl 33.79\n",
"| epoch 39 | 1800/ 3181 batches | lr 0.71 | ms/batch 61.46 | loss 3.52 | ppl 33.85\n",
"| epoch 39 | 2000/ 3181 batches | lr 0.71 | ms/batch 61.41 | loss 3.51 | ppl 33.32\n",
"| epoch 39 | 2200/ 3181 batches | lr 0.71 | ms/batch 61.38 | loss 3.50 | ppl 33.21\n",
"| epoch 39 | 2400/ 3181 batches | lr 0.71 | ms/batch 61.38 | loss 3.51 | ppl 33.39\n",
"| epoch 39 | 2600/ 3181 batches | lr 0.71 | ms/batch 61.42 | loss 3.45 | ppl 31.48\n",
"| epoch 39 | 2800/ 3181 batches | lr 0.71 | ms/batch 61.37 | loss 3.53 | ppl 34.01\n",
"| epoch 39 | 3000/ 3181 batches | lr 0.71 | ms/batch 61.42 | loss 3.43 | ppl 30.93\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 39 | time: 206.75s | valid loss 5.92 | valid ppl 370.78\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 40 | 200/ 3181 batches | lr 0.68 | ms/batch 61.74 | loss 3.57 | ppl 35.52\n",
"| epoch 40 | 400/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.50 | ppl 33.05\n",
"| epoch 40 | 600/ 3181 batches | lr 0.68 | ms/batch 61.43 | loss 3.47 | ppl 32.19\n",
"| epoch 40 | 800/ 3181 batches | lr 0.68 | ms/batch 61.44 | loss 3.53 | ppl 34.18\n",
"| epoch 40 | 1000/ 3181 batches | lr 0.68 | ms/batch 61.37 | loss 3.54 | ppl 34.37\n",
"| epoch 40 | 1200/ 3181 batches | lr 0.68 | ms/batch 61.45 | loss 3.50 | ppl 33.04\n",
"| epoch 40 | 1400/ 3181 batches | lr 0.68 | ms/batch 61.45 | loss 3.53 | ppl 34.21\n",
"| epoch 40 | 1600/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.51 | ppl 33.31\n",
"| epoch 40 | 1800/ 3181 batches | lr 0.68 | ms/batch 61.37 | loss 3.51 | ppl 33.39\n",
"| epoch 40 | 2000/ 3181 batches | lr 0.68 | ms/batch 61.44 | loss 3.50 | ppl 32.98\n",
"| epoch 40 | 2200/ 3181 batches | lr 0.68 | ms/batch 61.40 | loss 3.49 | ppl 32.67\n",
"| epoch 40 | 2400/ 3181 batches | lr 0.68 | ms/batch 61.43 | loss 3.49 | ppl 32.91\n",
"| epoch 40 | 2600/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.44 | ppl 31.10\n",
"| epoch 40 | 2800/ 3181 batches | lr 0.68 | ms/batch 61.34 | loss 3.51 | ppl 33.46\n",
"| epoch 40 | 3000/ 3181 batches | lr 0.68 | ms/batch 61.28 | loss 3.42 | ppl 30.62\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 40 | time: 206.76s | valid loss 5.93 | valid ppl 376.29\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 41 | 200/ 3181 batches | lr 0.64 | ms/batch 61.65 | loss 3.56 | ppl 35.06\n",
"| epoch 41 | 400/ 3181 batches | lr 0.64 | ms/batch 61.47 | loss 3.49 | ppl 32.70\n",
"| epoch 41 | 600/ 3181 batches | lr 0.64 | ms/batch 61.39 | loss 3.45 | ppl 31.62\n",
"| epoch 41 | 800/ 3181 batches | lr 0.64 | ms/batch 61.38 | loss 3.52 | ppl 33.65\n",
"| epoch 41 | 1000/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.53 | ppl 33.98\n",
"| epoch 41 | 1200/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.49 | ppl 32.75\n",
"| epoch 41 | 1400/ 3181 batches | lr 0.64 | ms/batch 61.41 | loss 3.52 | ppl 33.71\n",
"| epoch 41 | 1600/ 3181 batches | lr 0.64 | ms/batch 61.45 | loss 3.50 | ppl 33.06\n",
"| epoch 41 | 1800/ 3181 batches | lr 0.64 | ms/batch 61.35 | loss 3.50 | ppl 33.27\n",
"| epoch 41 | 2000/ 3181 batches | lr 0.64 | ms/batch 61.43 | loss 3.49 | ppl 32.72\n",
"| epoch 41 | 2200/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.47 | ppl 32.12\n",
"| epoch 41 | 2400/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.48 | ppl 32.54\n",
"| epoch 41 | 2600/ 3181 batches | lr 0.64 | ms/batch 61.44 | loss 3.43 | ppl 30.92\n",
"| epoch 41 | 2800/ 3181 batches | lr 0.64 | ms/batch 61.37 | loss 3.50 | ppl 33.15\n",
"| epoch 41 | 3000/ 3181 batches | lr 0.64 | ms/batch 61.38 | loss 3.41 | ppl 30.36\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 41 | time: 206.74s | valid loss 5.91 | valid ppl 369.07\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 42 | 200/ 3181 batches | lr 0.61 | ms/batch 61.72 | loss 3.55 | ppl 34.66\n",
"| epoch 42 | 400/ 3181 batches | lr 0.61 | ms/batch 61.40 | loss 3.48 | ppl 32.31\n",
"| epoch 42 | 600/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.45 | ppl 31.42\n",
"| epoch 42 | 800/ 3181 batches | lr 0.61 | ms/batch 61.36 | loss 3.51 | ppl 33.32\n",
"| epoch 42 | 1000/ 3181 batches | lr 0.61 | ms/batch 61.42 | loss 3.52 | ppl 33.79\n",
"| epoch 42 | 1200/ 3181 batches | lr 0.61 | ms/batch 61.43 | loss 3.47 | ppl 32.12\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 42 | 1400/ 3181 batches | lr 0.61 | ms/batch 61.45 | loss 3.50 | ppl 33.28\n",
"| epoch 42 | 1600/ 3181 batches | lr 0.61 | ms/batch 61.41 | loss 3.49 | ppl 32.66\n",
"| epoch 42 | 1800/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.49 | ppl 32.80\n",
"| epoch 42 | 2000/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.47 | ppl 32.27\n",
"| epoch 42 | 2200/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.46 | ppl 31.79\n",
"| epoch 42 | 2400/ 3181 batches | lr 0.61 | ms/batch 61.44 | loss 3.48 | ppl 32.32\n",
"| epoch 42 | 2600/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.42 | ppl 30.42\n",
"| epoch 42 | 2800/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.50 | ppl 32.97\n",
"| epoch 42 | 3000/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.40 | ppl 29.94\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 42 | time: 206.74s | valid loss 5.92 | valid ppl 371.93\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 43 | 200/ 3181 batches | lr 0.58 | ms/batch 61.68 | loss 3.53 | ppl 34.15\n",
"| epoch 43 | 400/ 3181 batches | lr 0.58 | ms/batch 61.40 | loss 3.47 | ppl 32.05\n",
"| epoch 43 | 600/ 3181 batches | lr 0.58 | ms/batch 61.38 | loss 3.44 | ppl 31.09\n",
"| epoch 43 | 800/ 3181 batches | lr 0.58 | ms/batch 61.37 | loss 3.50 | ppl 33.06\n",
"| epoch 43 | 1000/ 3181 batches | lr 0.58 | ms/batch 61.42 | loss 3.51 | ppl 33.36\n",
"| epoch 43 | 1200/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.47 | ppl 31.98\n",
"| epoch 43 | 1400/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.50 | ppl 32.97\n",
"| epoch 43 | 1600/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.47 | ppl 32.29\n",
"| epoch 43 | 1800/ 3181 batches | lr 0.58 | ms/batch 61.46 | loss 3.47 | ppl 32.22\n",
"| epoch 43 | 2000/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.46 | ppl 31.72\n",
"| epoch 43 | 2200/ 3181 batches | lr 0.58 | ms/batch 61.33 | loss 3.45 | ppl 31.37\n",
"| epoch 43 | 2400/ 3181 batches | lr 0.58 | ms/batch 61.36 | loss 3.46 | ppl 31.81\n",
"| epoch 43 | 2600/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.41 | ppl 30.17\n",
"| epoch 43 | 2800/ 3181 batches | lr 0.58 | ms/batch 61.44 | loss 3.49 | ppl 32.63\n",
"| epoch 43 | 3000/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.39 | ppl 29.80\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 43 | time: 206.72s | valid loss 5.98 | valid ppl 394.70\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 44 | 200/ 3181 batches | lr 0.55 | ms/batch 61.69 | loss 3.52 | ppl 33.79\n",
"| epoch 44 | 400/ 3181 batches | lr 0.55 | ms/batch 61.47 | loss 3.46 | ppl 31.81\n",
"| epoch 44 | 600/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.42 | ppl 30.69\n",
"| epoch 44 | 800/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.48 | ppl 32.49\n",
"| epoch 44 | 1000/ 3181 batches | lr 0.55 | ms/batch 61.43 | loss 3.49 | ppl 32.84\n",
"| epoch 44 | 1200/ 3181 batches | lr 0.55 | ms/batch 61.43 | loss 3.45 | ppl 31.56\n",
"| epoch 44 | 1400/ 3181 batches | lr 0.55 | ms/batch 61.41 | loss 3.48 | ppl 32.59\n",
"| epoch 44 | 1600/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.46 | ppl 31.93\n",
"| epoch 44 | 1800/ 3181 batches | lr 0.55 | ms/batch 61.41 | loss 3.46 | ppl 31.94\n",
"| epoch 44 | 2000/ 3181 batches | lr 0.55 | ms/batch 61.36 | loss 3.45 | ppl 31.62\n",
"| epoch 44 | 2200/ 3181 batches | lr 0.55 | ms/batch 61.37 | loss 3.44 | ppl 31.16\n",
"| epoch 44 | 2400/ 3181 batches | lr 0.55 | ms/batch 61.35 | loss 3.45 | ppl 31.47\n",
"| epoch 44 | 2600/ 3181 batches | lr 0.55 | ms/batch 61.40 | loss 3.39 | ppl 29.77\n",
"| epoch 44 | 2800/ 3181 batches | lr 0.55 | ms/batch 61.44 | loss 3.47 | ppl 32.19\n",
"| epoch 44 | 3000/ 3181 batches | lr 0.55 | ms/batch 61.44 | loss 3.38 | ppl 29.40\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 44 | time: 206.75s | valid loss 5.96 | valid ppl 389.15\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 45 | 200/ 3181 batches | lr 0.52 | ms/batch 61.74 | loss 3.51 | ppl 33.51\n",
"| epoch 45 | 400/ 3181 batches | lr 0.52 | ms/batch 61.41 | loss 3.45 | ppl 31.39\n",
"| epoch 45 | 600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.42 | ppl 30.55\n",
"| epoch 45 | 800/ 3181 batches | lr 0.52 | ms/batch 61.35 | loss 3.48 | ppl 32.50\n",
"| epoch 45 | 1000/ 3181 batches | lr 0.52 | ms/batch 61.38 | loss 3.48 | ppl 32.51\n",
"| epoch 45 | 1200/ 3181 batches | lr 0.52 | ms/batch 61.33 | loss 3.45 | ppl 31.39\n",
"| epoch 45 | 1400/ 3181 batches | lr 0.52 | ms/batch 61.43 | loss 3.47 | ppl 32.20\n",
"| epoch 45 | 1600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.45 | ppl 31.63\n",
"| epoch 45 | 1800/ 3181 batches | lr 0.52 | ms/batch 61.45 | loss 3.45 | ppl 31.61\n",
"| epoch 45 | 2000/ 3181 batches | lr 0.52 | ms/batch 61.36 | loss 3.44 | ppl 31.17\n",
"| epoch 45 | 2200/ 3181 batches | lr 0.52 | ms/batch 61.48 | loss 3.43 | ppl 31.02\n",
"| epoch 45 | 2400/ 3181 batches | lr 0.52 | ms/batch 61.45 | loss 3.44 | ppl 31.18\n",
"| epoch 45 | 2600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.39 | ppl 29.52\n",
"| epoch 45 | 2800/ 3181 batches | lr 0.52 | ms/batch 61.43 | loss 3.46 | ppl 31.72\n",
"| epoch 45 | 3000/ 3181 batches | lr 0.52 | ms/batch 61.48 | loss 3.37 | ppl 29.15\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 45 | time: 206.77s | valid loss 5.99 | valid ppl 398.09\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 46 | 200/ 3181 batches | lr 0.50 | ms/batch 61.71 | loss 3.50 | ppl 33.04\n",
"| epoch 46 | 400/ 3181 batches | lr 0.50 | ms/batch 61.45 | loss 3.44 | ppl 31.04\n",
"| epoch 46 | 600/ 3181 batches | lr 0.50 | ms/batch 61.39 | loss 3.41 | ppl 30.26\n",
"| epoch 46 | 800/ 3181 batches | lr 0.50 | ms/batch 61.47 | loss 3.47 | ppl 32.01\n",
"| epoch 46 | 1000/ 3181 batches | lr 0.50 | ms/batch 61.39 | loss 3.47 | ppl 32.08\n",
"| epoch 46 | 1200/ 3181 batches | lr 0.50 | ms/batch 61.41 | loss 3.43 | ppl 30.86\n",
"| epoch 46 | 1400/ 3181 batches | lr 0.50 | ms/batch 61.34 | loss 3.47 | ppl 32.15\n",
"| epoch 46 | 1600/ 3181 batches | lr 0.50 | ms/batch 61.44 | loss 3.44 | ppl 31.32\n",
"| epoch 46 | 1800/ 3181 batches | lr 0.50 | ms/batch 61.42 | loss 3.45 | ppl 31.49\n",
"| epoch 46 | 2000/ 3181 batches | lr 0.50 | ms/batch 61.42 | loss 3.44 | ppl 31.04\n",
"| epoch 46 | 2200/ 3181 batches | lr 0.50 | ms/batch 61.48 | loss 3.42 | ppl 30.63\n",
"| epoch 46 | 2400/ 3181 batches | lr 0.50 | ms/batch 61.35 | loss 3.43 | ppl 30.95\n",
"| epoch 46 | 2600/ 3181 batches | lr 0.50 | ms/batch 61.37 | loss 3.38 | ppl 29.38\n",
"| epoch 46 | 2800/ 3181 batches | lr 0.50 | ms/batch 61.41 | loss 3.45 | ppl 31.45\n",
"| epoch 46 | 3000/ 3181 batches | lr 0.50 | ms/batch 61.45 | loss 3.37 | ppl 28.96\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 46 | time: 206.77s | valid loss 5.96 | valid ppl 389.00\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 47 | 200/ 3181 batches | lr 0.47 | ms/batch 61.74 | loss 3.49 | ppl 32.78\n",
"| epoch 47 | 400/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 30.76\n",
"| epoch 47 | 600/ 3181 batches | lr 0.47 | ms/batch 61.47 | loss 3.40 | ppl 29.86\n",
"| epoch 47 | 800/ 3181 batches | lr 0.47 | ms/batch 61.39 | loss 3.46 | ppl 31.86\n",
"| epoch 47 | 1000/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.46 | ppl 31.90\n",
"| epoch 47 | 1200/ 3181 batches | lr 0.47 | ms/batch 61.38 | loss 3.42 | ppl 30.71\n",
"| epoch 47 | 1400/ 3181 batches | lr 0.47 | ms/batch 61.39 | loss 3.46 | ppl 31.91\n",
"| epoch 47 | 1600/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 31.00\n",
"| epoch 47 | 1800/ 3181 batches | lr 0.47 | ms/batch 61.54 | loss 3.44 | ppl 31.18\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| epoch 47 | 2000/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 30.73\n",
"| epoch 47 | 2200/ 3181 batches | lr 0.47 | ms/batch 61.49 | loss 3.41 | ppl 30.37\n",
"| epoch 47 | 2400/ 3181 batches | lr 0.47 | ms/batch 61.52 | loss 3.42 | ppl 30.63\n",
"| epoch 47 | 2600/ 3181 batches | lr 0.47 | ms/batch 61.49 | loss 3.37 | ppl 28.98\n",
"| epoch 47 | 2800/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.45 | ppl 31.34\n",
"| epoch 47 | 3000/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.35 | ppl 28.50\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 47 | time: 206.90s | valid loss 5.96 | valid ppl 388.68\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 48 | 200/ 3181 batches | lr 0.45 | ms/batch 61.70 | loss 3.48 | ppl 32.61\n",
"| epoch 48 | 400/ 3181 batches | lr 0.45 | ms/batch 61.43 | loss 3.42 | ppl 30.51\n",
"| epoch 48 | 600/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.39 | ppl 29.65\n",
"| epoch 48 | 800/ 3181 batches | lr 0.45 | ms/batch 61.36 | loss 3.46 | ppl 31.70\n",
"| epoch 48 | 1000/ 3181 batches | lr 0.45 | ms/batch 61.50 | loss 3.46 | ppl 31.66\n",
"| epoch 48 | 1200/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.42 | ppl 30.56\n",
"| epoch 48 | 1400/ 3181 batches | lr 0.45 | ms/batch 61.49 | loss 3.45 | ppl 31.65\n",
"| epoch 48 | 1600/ 3181 batches | lr 0.45 | ms/batch 61.43 | loss 3.42 | ppl 30.66\n",
"| epoch 48 | 1800/ 3181 batches | lr 0.45 | ms/batch 61.41 | loss 3.43 | ppl 30.74\n",
"| epoch 48 | 2000/ 3181 batches | lr 0.45 | ms/batch 61.40 | loss 3.42 | ppl 30.48\n",
"| epoch 48 | 2200/ 3181 batches | lr 0.45 | ms/batch 61.47 | loss 3.41 | ppl 30.33\n",
"| epoch 48 | 2400/ 3181 batches | lr 0.45 | ms/batch 61.38 | loss 3.41 | ppl 30.38\n",
"| epoch 48 | 2600/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.36 | ppl 28.89\n",
"| epoch 48 | 2800/ 3181 batches | lr 0.45 | ms/batch 61.42 | loss 3.43 | ppl 30.96\n",
"| epoch 48 | 3000/ 3181 batches | lr 0.45 | ms/batch 61.39 | loss 3.34 | ppl 28.33\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 48 | time: 206.81s | valid loss 5.95 | valid ppl 383.84\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 49 | 200/ 3181 batches | lr 0.43 | ms/batch 61.73 | loss 3.47 | ppl 32.23\n",
"| epoch 49 | 400/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.41 | ppl 30.33\n",
"| epoch 49 | 600/ 3181 batches | lr 0.43 | ms/batch 61.43 | loss 3.38 | ppl 29.52\n",
"| epoch 49 | 800/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.45 | ppl 31.40\n",
"| epoch 49 | 1000/ 3181 batches | lr 0.43 | ms/batch 61.47 | loss 3.46 | ppl 31.72\n",
"| epoch 49 | 1200/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.41 | ppl 30.25\n",
"| epoch 49 | 1400/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.45 | ppl 31.41\n",
"| epoch 49 | 1600/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.42 | ppl 30.56\n",
"| epoch 49 | 1800/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.43 | ppl 30.75\n",
"| epoch 49 | 2000/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.41 | ppl 30.28\n",
"| epoch 49 | 2200/ 3181 batches | lr 0.43 | ms/batch 61.45 | loss 3.41 | ppl 30.14\n",
"| epoch 49 | 2400/ 3181 batches | lr 0.43 | ms/batch 61.47 | loss 3.41 | ppl 30.22\n",
"| epoch 49 | 2600/ 3181 batches | lr 0.43 | ms/batch 61.44 | loss 3.35 | ppl 28.48\n",
"| epoch 49 | 2800/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.43 | ppl 30.72\n",
"| epoch 49 | 3000/ 3181 batches | lr 0.43 | ms/batch 61.37 | loss 3.34 | ppl 28.18\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 49 | time: 206.79s | valid loss 5.98 | valid ppl 395.98\n",
"-----------------------------------------------------------------------------------------\n",
"| epoch 50 | 200/ 3181 batches | lr 0.40 | ms/batch 61.71 | loss 3.47 | ppl 32.06\n",
"| epoch 50 | 400/ 3181 batches | lr 0.40 | ms/batch 61.39 | loss 3.41 | ppl 30.15\n",
"| epoch 50 | 600/ 3181 batches | lr 0.40 | ms/batch 61.37 | loss 3.38 | ppl 29.27\n",
"| epoch 50 | 800/ 3181 batches | lr 0.40 | ms/batch 61.42 | loss 3.43 | ppl 31.02\n",
"| epoch 50 | 1000/ 3181 batches | lr 0.40 | ms/batch 61.34 | loss 3.44 | ppl 31.16\n",
"| epoch 50 | 1200/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.40 | ppl 29.97\n",
"| epoch 50 | 1400/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.44 | ppl 31.23\n",
"| epoch 50 | 1600/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.41 | ppl 30.24\n",
"| epoch 50 | 1800/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.42 | ppl 30.64\n",
"| epoch 50 | 2000/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.40 | ppl 30.07\n",
"| epoch 50 | 2200/ 3181 batches | lr 0.40 | ms/batch 61.49 | loss 3.39 | ppl 29.78\n",
"| epoch 50 | 2400/ 3181 batches | lr 0.40 | ms/batch 61.41 | loss 3.40 | ppl 29.98\n",
"| epoch 50 | 2600/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.35 | ppl 28.40\n",
"| epoch 50 | 2800/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.43 | ppl 30.72\n",
"| epoch 50 | 3000/ 3181 batches | lr 0.40 | ms/batch 61.44 | loss 3.34 | ppl 28.08\n",
"-----------------------------------------------------------------------------------------\n",
"| end of epoch 50 | time: 206.77s | valid loss 6.01 | valid ppl 407.69\n",
"-----------------------------------------------------------------------------------------\n"
]
}
],
"source": [
"best_val_loss = float('inf')\n",
"epochs = 50\n",
"best_model = None\n",
"\n",
"for epoch in range(1, epochs + 1):\n",
" epoch_start_time = time.time()\n",
" train(model)\n",
" val_loss = evaluate(model, val_data)\n",
" val_ppl = math.exp(val_loss)\n",
" elapsed = time.time() - epoch_start_time\n",
" print('-' * 89)\n",
" print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '\n",
" f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')\n",
" print('-' * 89)\n",
"\n",
" if val_loss < best_val_loss:\n",
" best_val_loss = val_loss\n",
" best_model = copy.deepcopy(model)\n",
"\n",
" scheduler.step()"
]
},
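{
"cell_type": "markdown",
"id": "e5a1c001",
"metadata": {},
"source": [
"### Optional: early stopping (sketch)\n",
"In the epochs logged above, the validation perplexity is lowest around epoch 25 (~284) and then climbs steadily while the training loss keeps falling, i.e. the model starts to overfit. The loop already snapshots the best model, but training still runs all 50 epochs. Below is a minimal patience-based early-stopping variant; it reuses the same `train`, `evaluate`, `scheduler` and `model` objects and is an untested sketch, not the procedure used for the results above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c002",
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch: stop once validation loss has not improved for `patience` epochs\n",
"patience = 5  # hypothetical value\n",
"epochs_without_improvement = 0\n",
"best_val_loss = float('inf')\n",
"best_model = None\n",
"\n",
"for epoch in range(1, epochs + 1):\n",
"    train(model)\n",
"    val_loss = evaluate(model, val_data)\n",
"    if val_loss < best_val_loss:\n",
"        best_val_loss = val_loss\n",
"        best_model = copy.deepcopy(model)\n",
"        epochs_without_improvement = 0\n",
"    else:\n",
"        epochs_without_improvement += 1\n",
"        if epochs_without_improvement >= patience:\n",
"            print(f'stopping early after epoch {epoch}')\n",
"            break\n",
"    scheduler.step()"
]
},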
{
"cell_type": "markdown",
"id": "f0d32419",
"metadata": {},
"source": [
"### print info about best model after training"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "12fdd0aa",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=========================================================================================\n",
"| End of training | test loss 5.36 | test ppl 213.09\n",
"=========================================================================================\n"
]
}
],
"source": [
"test_loss = evaluate(best_model, test_data)\n",
"test_ppl = math.exp(test_loss)\n",
"print('=' * 89)\n",
"print(f'| End of training | test loss {test_loss:5.2f} | '\n",
" f'test ppl {test_ppl:8.2f}')\n",
"print('=' * 89)"
]
},
{
"cell_type": "markdown",
"id": "528c9f10",
"metadata": {},
"source": [
"### save trained model to file"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "848af399",
"metadata": {},
"outputs": [],
"source": [
"torch.save(best_model.state_dict(), \"autocomplete_model\")"
]
},
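{
"cell_type": "markdown",
"id": "e5a1c003",
"metadata": {},
"source": [
"The cell above stores only the `state_dict`, so the model class must be re-instantiated with identical hyperparameters before loading. A more self-contained alternative is to bundle the vocabulary into the checkpoint as well (a sketch; it assumes the `vocab` object is picklable, which holds for torchtext vocabs, and uses a hypothetical file name):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c004",
"metadata": {},
"outputs": [],
"source": [
"# hedged sketch: bundle weights and vocabulary in a single checkpoint file\n",
"checkpoint = {\n",
"    'model_state': best_model.state_dict(),\n",
"    'vocab': vocab,  # needed to map tokens <-> indices at inference time\n",
"}\n",
"torch.save(checkpoint, 'autocomplete_checkpoint.pt')"
]
},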
{
"cell_type": "markdown",
"id": "09df56cf",
"metadata": {},
"source": [
"## Now we can try to predict based on trained model"
]
},
{
"cell_type": "markdown",
"id": "e685d3e1",
"metadata": {},
"source": [
"### define input batch "
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "cfb30fe0",
"metadata": {},
"outputs": [],
"source": [
"sample_batch = [\n",
" \"The brain is\",\n",
" \"The lung is\"\n",
"]\n",
"input_batch = sample_batch"
]
},
{
"cell_type": "markdown",
"id": "10d51d39",
"metadata": {},
"source": [
"### define initial source mask for model"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "305853e8",
"metadata": {},
"outputs": [],
"source": [
"bptt = 3\n",
"src_mask = generate_square_subsequent_mask(bptt).to(device)"
]
},
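{
"cell_type": "markdown",
"id": "e5a1c005",
"metadata": {},
"source": [
"`generate_square_subsequent_mask` (defined alongside the model) builds a causal mask: `-inf` above the diagonal, so position *i* can only attend to positions up to *i*. A quick illustration, assuming the usual `torch.triu`-based construction from the PyTorch transformer tutorial:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c006",
"metadata": {},
"outputs": [],
"source": [
"# equivalent construction for illustration: -inf above the diagonal, 0 elsewhere\n",
"demo_mask = torch.triu(torch.full((3, 3), float('-inf')), diagonal=1)\n",
"print(demo_mask)\n",
"# tensor([[0., -inf, -inf],\n",
"#         [0., 0., -inf],\n",
"#         [0., 0., 0.]])"
]
},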
{
"cell_type": "markdown",
"id": "fe250072",
"metadata": {},
"source": [
"### define iterator for predict batch "
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "afe585d6",
"metadata": {},
"outputs": [],
"source": [
"def predict_abstract_iter():\n",
" for batch in sample_batch:\n",
" yield tokenizer(batch)"
]
},
{
"cell_type": "markdown",
"id": "b043de0a",
"metadata": {},
"source": [
"### load data into tensor for model to process"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "8bfaa8bd",
"metadata": {},
"outputs": [],
"source": [
"def toDataTensor():\n",
" predict_generator = predict_abstract_iter()\n",
" return [torch.tensor(vocab.lookup_indices(item)) for item in predict_generator]"
]
},
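{
"cell_type": "markdown",
"id": "e5a1c007",
"metadata": {},
"source": [
"To see what the model actually receives, the text-to-index round trip can be checked directly. The exact indices depend on the vocabulary built from the fetched abstracts (the values in the comments are from our run, assuming the `basic_english` tokenizer):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c008",
"metadata": {},
"outputs": [],
"source": [
"tokens = tokenizer('The brain is')      # lowercased: ['the', 'brain', 'is']\n",
"indices = vocab.lookup_indices(tokens)  # [3, 161, 18] in our run\n",
"print(tokens, indices)\n",
"print(vocab.lookup_tokens(indices))     # and back to tokens"
]
},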
{
"cell_type": "markdown",
"id": "a800ffea",
"metadata": {},
"source": [
"### check device once again (prob not needed)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "6e2c35ba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cuda')"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"device"
]
},
{
"cell_type": "markdown",
"id": "bef90722",
"metadata": {},
"source": [
"### optionally load model from file if it was trained already"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "223eed8a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<All keys matched successfully>"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best_model.load_state_dict(torch.load(\"autocomplete_model\"))"
]
},
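{
"cell_type": "markdown",
"id": "e5a1c009",
"metadata": {},
"source": [
"If the file was written on a GPU machine and is later loaded on a CPU-only one, `torch.load` should be given a `map_location` so that stored CUDA tensors are remapped; this is standard PyTorch usage:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c010",
"metadata": {},
"outputs": [],
"source": [
"# remap stored tensors to whatever device is currently available\n",
"state_dict = torch.load(\"autocomplete_model\", map_location=device)\n",
"best_model.load_state_dict(state_dict)"
]
},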
{
"cell_type": "markdown",
"id": "dd71bdfc",
"metadata": {},
"source": [
"### define predict function"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "64223e87",
"metadata": {},
"outputs": [],
"source": [
"def predict(input_line, mask, n_predictions=3):\n",
" print('\\n> %s' % input_line)\n",
" with torch.no_grad():\n",
" output = best_model(input_line.to(device), mask)\n",
"\n",
" # Get top N categories\n",
" topv, topi = output.topk(n_predictions, 1, True)\n",
"\n",
" predictions = []\n",
" for i in range(n_predictions):\n",
" value = topv[0][i]\n",
" v1, v2 = value.topk(1)\n",
" predict_token_index = v2.cpu().detach().numpy()\n",
" print(\"predict token index: \", predict_token_index)\n",
" predictions.append(vocab.lookup_token(predict_token_index))\n",
" return predictions"
]
},
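{
"cell_type": "markdown",
"id": "e5a1c011",
"metadata": {},
"source": [
"`predict` above inspects the top-scoring entries position by position; for next-token completion it is more common to read only the logits at the **last** input position. A minimal greedy alternative (an untested sketch reusing `best_model`, `vocab` and `device`; `predict_next_tokens` is a hypothetical helper name):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c012",
"metadata": {},
"outputs": [],
"source": [
"def predict_next_tokens(input_line, mask, n_predictions=3):\n",
"    \"\"\"Return the n most likely next tokens after the last input position.\"\"\"\n",
"    with torch.no_grad():\n",
"        output = best_model(input_line.to(device), mask)\n",
"    last_logits = output[-1].squeeze()            # scores over the vocabulary\n",
"    topv, topi = last_logits.topk(n_predictions)  # highest-scoring indices\n",
"    return vocab.lookup_tokens(topi.cpu().tolist())"
]
},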
{
"cell_type": "markdown",
"id": "5b33b9f3",
"metadata": {},
"source": [
"### Execute prediction and display predicted values and choose continuation"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "b2895698",
"metadata": {},
"outputs": [],
"source": [
"def predict_loop(num_of_pred):\n",
" iteration = 0\n",
" is_terminated = False\n",
" input_batch = sample_batch\n",
" while(not is_terminated):\n",
" # 2*count is need because spaces count aswell\n",
" mask_size = bptt+(iteration) \n",
" src_mask = generate_square_subsequent_mask(mask_size).to(device)\n",
" data = toDataTensor()\n",
" for i, d in enumerate(data):\n",
" predictions = predict(d, src_mask, num_of_pred)\n",
" print(\"Current input:\", i)\n",
" print(input_batch[i])\n",
" print(\"Possible continuations:\")\n",
" for j in range(len(predictions)):\n",
" print(j + 1, \": \", predictions[j])\n",
" s_index = input(\"Choose continuation by index:\")\n",
" if(\"e\" in s_index):\n",
" is_terminated = True\n",
" print(\"prediction stopped.\")\n",
" break\n",
"\n",
" print(\"Text is now:\")\n",
" input_batch[i] += (\" \" + predictions[int(s_index) -1])\n",
" print(input_batch[i])\n",
"\n",
" iteration = iteration + 1"
]
},
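{
"cell_type": "markdown",
"id": "e5a1c013",
"metadata": {},
"source": [
"For a quick non-interactive check, the same machinery can extend each prompt greedily without console input (a sketch building on the hypothetical `predict_next_tokens` helper above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a1c014",
"metadata": {},
"outputs": [],
"source": [
"def autocomplete(prompt, n_tokens=5):\n",
"    \"\"\"Greedily append the single most likely next token, n_tokens times.\"\"\"\n",
"    text = prompt\n",
"    for _ in range(n_tokens):\n",
"        tokens = tokenizer(text)\n",
"        data = torch.tensor(vocab.lookup_indices(tokens))\n",
"        mask = generate_square_subsequent_mask(len(tokens)).to(device)\n",
"        text += ' ' + predict_next_tokens(data, mask, n_predictions=1)[0]\n",
"    return text\n",
"\n",
"print(autocomplete('The brain is'))"
]
},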
{
"cell_type": "code",
"execution_count": null,
"id": "13ed9298",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"> tensor([ 3, 161, 18])\n",
"predict token index: [2]\n",
"predict token index: [5]\n",
"predict token index: [3]\n",
"Current input: 0\n",
"The brain is\n",
"Possible continuations:\n",
"1 : ,\n",
"2 : of\n",
"3 : the\n",
"Choose continuation by index:3\n",
"Text is now:\n",
"The brain is the\n",
"\n",
"> tensor([ 3, 374, 18])\n",
"predict token index: [2]\n",
"predict token index: [5]\n",
"predict token index: [183]\n",
"Current input: 1\n",
"The lung is\n",
"Possible continuations:\n",
"1 : ,\n",
"2 : of\n",
"3 : identified\n",
"Choose continuation by index:3\n",
"Text is now:\n",
"The lung is identified\n",
"\n",
"> tensor([ 3, 161, 18, 3])\n",
"predict token index: [2]\n",
"predict token index: [5]\n",
"predict token index: [132]\n",
"Current input: 0\n",
"The brain is the\n",
"Possible continuations:\n",
"1 : ,\n",
"2 : of\n",
"3 : most\n",
"Choose continuation by index:3\n",
"Text is now:\n",
"The brain is the most\n",
"\n",
"> tensor([ 3, 374, 18, 183])\n",
"predict token index: [2]\n",
"predict token index: [5]\n",
"predict token index: [8]\n",
"Current input: 1\n",
"The lung is identified\n",
"Possible continuations:\n",
"1 : ,\n",
"2 : of\n",
"3 : in\n",
"Choose continuation by index:1\n",
"Text is now:\n",
"The lung is identified ,\n",
"\n",
"> tensor([ 3, 161, 18, 3, 132])\n",
"predict token index: [258]\n",
"predict token index: [5]\n",
"predict token index: [5]\n",
"Current input: 0\n",
"The brain is the most\n",
"Possible continuations:\n",
"1 : common\n",
"2 : of\n",
"3 : of\n",
"Choose continuation by index:1\n",
"Text is now:\n",
"The brain is the most common\n",
"\n",
"> tensor([ 3, 374, 18, 183, 2])\n",
"predict token index: [4]\n",
"predict token index: [4]\n",
"predict token index: [3]\n",
"Current input: 1\n",
"The lung is identified ,\n",
"Possible continuations:\n",
"1 : and\n",
"2 : and\n",
"3 : the\n",
"Choose continuation by index:3\n",
"Text is now:\n",
"The lung is identified , the\n",
"\n",
"> tensor([ 3, 161, 18, 3, 132, 258])\n",
"predict token index: [258]\n",
"predict token index: [1]\n",
"predict token index: [5]\n",
"Current input: 0\n",
"The brain is the most common\n",
"Possible continuations:\n",
"1 : common\n",
"2 : .\n",
"3 : of\n",
"Choose continuation by index:3\n",
"Text is now:\n",
"The brain is the most common of\n",
"\n",
"> tensor([ 3, 374, 18, 183, 2, 3])\n",
"predict token index: [4]\n",
"predict token index: [4]\n",
"predict token index: [3]\n",
"Current input: 1\n",
"The lung is identified , the\n",
"Possible continuations:\n",
"1 : and\n",
"2 : and\n",
"3 : the\n"
]
}
],
"source": [
"predict_loop(3)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}