diff --git a/AutomaticSentenceCompletion.ipynb b/AutomaticSentenceCompletion.ipynb index 513fd5b..70ddfe1 100644 --- a/AutomaticSentenceCompletion.ipynb +++ b/AutomaticSentenceCompletion.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 25, "id": "e444b44c", "metadata": {}, "outputs": [], @@ -51,15 +51,15 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "id": "adfb256a", "metadata": {}, "outputs": [], "source": [ "def getPapers(myQuery, maxPapers, myEmail =\"leonard.starke@mailbox.tu-dresden.de\"):\n", " # Get articles from PubMed\n", - " Entrez.email =myEmail\n", - " record =Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n", + " Entrez.email = myEmail\n", + " record = Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n", " idlist = record[\"IdList\"]\n", " print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n", " records = Medline.parse(Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\"))\n", @@ -76,17 +76,17 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "bf797cc6", + "execution_count": 4, + "id": "39c3b352", "metadata": {}, "outputs": [], "source": [ - "amountOfPapers = 20000" + "amountOfPapers = 100000" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 27, "id": "00481ec9", "metadata": {}, "outputs": [ @@ -95,12 +95,12 @@ "output_type": "stream", "text": [ "\n", - "There are 20000 records for Cancer [tiab].\n" + "There are 9999 records for Blood [tiab].\n" ] } ], "source": [ - "myQuery =\"Cancer [tiab]\" #query in title and abstract\n", + "myQuery =\"Blood [tiab]\" #query in title and abstract\n", "maxPapers = amountOfPapers\n", "records = getPapers(myQuery, maxPapers)" ] @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 28, "id": "dcf5c217", "metadata": {}, "outputs": [], @@ -136,19 +136,10 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 30, "id": "c3199444", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/hein/.local/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "try:\n", " import torch\n", @@ -172,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 31, "id": "daca9db6", "metadata": {}, "outputs": [], @@ -184,6 +175,24 @@ " import numpy as np\n" ] }, + { + "cell_type": "markdown", + "id": "ec1db50b", + "metadata": {}, + "source": [ + "### import math module" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "eb32bd79", + "metadata": {}, + "outputs": [], + "source": [ + "import math" + ] + }, { "cell_type": "markdown", "id": "4df1e449", @@ -194,8 +203,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "0e838dae", + "execution_count": 34, + "id": "3f23404d", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 35, "id": "8a128d3c", "metadata": {}, "outputs": [], @@ -218,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 36, "id": "97e89986", "metadata": {}, "outputs": [], @@ -230,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 37, "id": "0d6e89c4", "metadata": {}, "outputs": [], @@ -250,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 38, "id": "0bdbc40a", "metadata": {}, "outputs": [], @@ -266,12 +275,12 @@ "id": "37da40bb", "metadata": {}, "source": [ - "### Map every world to a id to store inside torch tensor" + "### Map every word to an id to store inside torch tensor" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 39, "id": "a438ab1f", "metadata": {}, "outputs": [], @@ -292,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 40, "id": "0e5bc361", "metadata": {}, "outputs": [], @@ -305,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 41, "id": "dfd7400d", "metadata": {}, "outputs": [], @@ -328,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 42, "id": "c155ee31", "metadata": {}, "outputs": [], @@ -338,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 43, "id": "79b2d248", "metadata": {}, "outputs": [ @@ -348,7 +357,7 @@ "device(type='cuda')" ] }, - "execution_count": 33, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -367,15 +376,13 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 46, "id": "a33d722f", "metadata": {}, "outputs": [], "source": [ - "import math\n", "from typing import Tuple\n", "\n", - "import torch\n", "from torch import nn, Tensor\n", "import torch.nn.functional as F\n", "from torch.nn import TransformerEncoder, TransformerEncoderLayer\n", @@ -433,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 47, "id": "c2f6d33b", "metadata": {}, "outputs": [], @@ -470,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 48, "id": "9e184841", "metadata": {}, "outputs": [], @@ -494,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 49, "id": "a4def1ac", "metadata": {}, "outputs": [], @@ -516,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 50, "id": "4ab5b8fd", "metadata": {}, "outputs": [], @@ -548,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 51, "id": "c53764da", "metadata": {}, "outputs": [], @@ -572,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 52, "id": "ddaa1d64", "metadata": {}, "outputs": [], @@ -596,12 +603,11 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 54, "id": "50ab3fb6", "metadata": {}, "outputs": [], "source": [ - "\n", "def train(model: nn.Module) -> None:\n", " model.train() # turn on train mode\n", " total_loss = 0.\n", @@ -646,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 55, "id": "3d179bb0", "metadata": {}, "outputs": [], @@ -677,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 56, "id": "09c4d4ce", "metadata": { "scrolled": true @@ -687,595 +693,593 @@ "name": "stdout", "output_type": "stream", "text": [ - "| epoch 1 | 200/ 2727 batches | lr 5.00 | ms/batch 94.99 | loss 8.87 | ppl 7091.23\n", - "| epoch 1 | 400/ 2727 batches | lr 5.00 | ms/batch 53.24 | loss 7.14 | ppl 1267.54\n", - "| epoch 1 | 600/ 2727 batches | lr 5.00 | ms/batch 53.30 | loss 6.57 | ppl 713.79\n", - "| epoch 1 | 800/ 2727 batches | lr 5.00 | ms/batch 53.30 | loss 6.34 | ppl 569.42\n", - "| epoch 1 | 1000/ 2727 batches | lr 5.00 | ms/batch 53.27 | loss 6.13 | ppl 460.15\n", - "| epoch 1 | 1200/ 2727 batches | lr 5.00 | ms/batch 53.36 | loss 6.04 | ppl 421.52\n", - "| epoch 1 | 1400/ 2727 batches | lr 5.00 | ms/batch 53.37 | loss 5.92 | ppl 371.29\n", - "| epoch 1 | 1600/ 2727 batches | lr 5.00 | ms/batch 53.36 | loss 5.76 | ppl 318.28\n", - "| epoch 1 | 1800/ 2727 batches | lr 5.00 | ms/batch 53.46 | loss 5.87 | ppl 354.01\n", - "| epoch 1 | 2000/ 2727 batches | lr 5.00 | ms/batch 53.47 | loss 5.80 | ppl 328.82\n", - "| epoch 1 | 2200/ 2727 batches | lr 5.00 | ms/batch 53.44 | loss 5.72 | ppl 304.20\n", - "| epoch 1 | 2400/ 2727 batches | lr 5.00 | ms/batch 53.45 | loss 5.75 | ppl 313.57\n", - "| epoch 1 | 2600/ 2727 batches | lr 5.00 | ms/batch 53.50 | loss 5.73 | ppl 307.48\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 1 | time: 160.70s | valid loss 5.61 | valid ppl 273.67\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 2 | 200/ 2727 batches | lr 4.75 | ms/batch 53.84 | loss 5.66 | ppl 286.69\n", - "| epoch 2 | 400/ 2727 batches | lr 4.75 | ms/batch 53.58 | loss 5.55 | ppl 258.35\n", - "| epoch 2 | 600/ 2727 batches | lr 4.75 | ms/batch 53.58 | loss 5.50 | ppl 245.61\n", - "| epoch 2 | 800/ 2727 batches | lr 4.75 | ms/batch 53.58 | loss 5.51 | ppl 248.30\n", - "| epoch 2 | 1000/ 2727 batches | lr 4.75 | ms/batch 53.60 | loss 5.44 | ppl 229.54\n", - "| epoch 2 | 1200/ 2727 batches | lr 4.75 | ms/batch 53.59 | loss 5.45 | ppl 233.38\n", - "| epoch 2 | 1400/ 2727 batches | lr 4.75 | ms/batch 53.62 | loss 5.39 | ppl 219.41\n", - "| epoch 2 | 1600/ 2727 batches | lr 4.75 | ms/batch 53.61 | loss 5.29 | ppl 197.66\n", - "| epoch 2 | 1800/ 2727 batches | lr 4.75 | ms/batch 53.58 | loss 5.44 | ppl 229.46\n", - "| epoch 2 | 2000/ 2727 batches | lr 4.75 | ms/batch 53.58 | loss 5.40 | ppl 221.35\n", - "| epoch 2 | 2200/ 2727 batches | lr 4.75 | ms/batch 53.59 | loss 5.35 | ppl 210.15\n", - "| epoch 2 | 2400/ 2727 batches | lr 4.75 | ms/batch 53.57 | loss 5.39 | ppl 219.29\n", - "| epoch 2 | 2600/ 2727 batches | lr 4.75 | ms/batch 53.58 | loss 5.39 | ppl 220.01\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 2 | time: 152.99s | valid loss 5.43 | valid ppl 228.25\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 3 | 200/ 2727 batches | lr 4.51 | ms/batch 53.88 | loss 5.37 | ppl 213.86\n", - "| epoch 3 | 400/ 2727 batches | lr 4.51 | ms/batch 53.59 | loss 5.27 | ppl 195.07\n", - "| epoch 3 | 600/ 2727 batches | lr 4.51 | ms/batch 53.61 | loss 5.25 | ppl 189.68\n", - "| epoch 3 | 800/ 2727 batches | lr 4.51 | ms/batch 53.59 | loss 5.26 | ppl 192.06\n", - "| epoch 3 | 1000/ 2727 batches | lr 4.51 | ms/batch 53.61 | loss 5.18 | ppl 177.42\n", - "| epoch 3 | 1200/ 2727 batches | lr 4.51 | ms/batch 53.58 | loss 5.23 | ppl 186.03\n", - "| epoch 3 | 1400/ 2727 batches | lr 4.51 | ms/batch 53.57 | loss 5.16 | ppl 174.77\n", - "| epoch 3 | 1600/ 2727 batches | lr 4.51 | ms/batch 53.58 | loss 5.07 | ppl 158.60\n", - "| epoch 3 | 1800/ 2727 batches | lr 4.51 | ms/batch 53.57 | loss 5.21 | ppl 182.78\n", - "| epoch 3 | 2000/ 2727 batches | lr 4.51 | ms/batch 53.55 | loss 5.19 | ppl 179.00\n", - "| epoch 3 | 2200/ 2727 batches | lr 4.51 | ms/batch 53.57 | loss 5.14 | ppl 171.26\n", - "| epoch 3 | 2400/ 2727 batches | lr 4.51 | ms/batch 53.57 | loss 5.19 | ppl 179.55\n", - "| epoch 3 | 2600/ 2727 batches | lr 4.51 | ms/batch 53.58 | loss 5.19 | ppl 179.28\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 3 | time: 152.97s | valid loss 5.39 | valid ppl 218.86\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 4 | 200/ 2727 batches | lr 4.29 | ms/batch 53.86 | loss 5.19 | ppl 178.71\n", - "| epoch 4 | 400/ 2727 batches | lr 4.29 | ms/batch 53.61 | loss 5.10 | ppl 164.53\n", - "| epoch 4 | 600/ 2727 batches | lr 4.29 | ms/batch 53.58 | loss 5.08 | ppl 161.14\n", - "| epoch 4 | 800/ 2727 batches | lr 4.29 | ms/batch 53.61 | loss 5.09 | ppl 162.64\n", - "| epoch 4 | 1000/ 2727 batches | lr 4.29 | ms/batch 53.60 | loss 5.03 | ppl 152.20\n", - "| epoch 4 | 1200/ 2727 batches | lr 4.29 | ms/batch 53.58 | loss 5.06 | ppl 158.23\n", - "| epoch 4 | 1400/ 2727 batches | lr 4.29 | ms/batch 53.61 | loss 5.01 | ppl 150.33\n", - "| epoch 4 | 1600/ 2727 batches | lr 4.29 | ms/batch 53.57 | loss 4.91 | ppl 136.30\n", - "| epoch 4 | 1800/ 2727 batches | lr 4.29 | ms/batch 53.60 | loss 5.06 | ppl 158.00\n", - "| epoch 4 | 2000/ 2727 batches | lr 4.29 | ms/batch 53.58 | loss 5.04 | ppl 154.90\n", - "| epoch 4 | 2200/ 2727 batches | lr 4.29 | ms/batch 53.58 | loss 5.00 | ppl 148.52\n", - "| epoch 4 | 2400/ 2727 batches | lr 4.29 | ms/batch 53.59 | loss 5.03 | ppl 153.47\n", - "| epoch 4 | 2600/ 2727 batches | lr 4.29 | ms/batch 53.59 | loss 5.04 | ppl 155.21\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 4 | time: 152.99s | valid loss 5.31 | valid ppl 202.73\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 5 | 200/ 2727 batches | lr 4.07 | ms/batch 53.89 | loss 5.05 | ppl 155.53\n", - "| epoch 5 | 400/ 2727 batches | lr 4.07 | ms/batch 53.60 | loss 4.96 | ppl 143.06\n", - "| epoch 5 | 600/ 2727 batches | lr 4.07 | ms/batch 53.60 | loss 4.95 | ppl 141.66\n", - "| epoch 5 | 800/ 2727 batches | lr 4.07 | ms/batch 53.60 | loss 4.96 | ppl 142.29\n", - "| epoch 5 | 1000/ 2727 batches | lr 4.07 | ms/batch 53.58 | loss 4.90 | ppl 134.14\n", - "| epoch 5 | 1200/ 2727 batches | lr 4.07 | ms/batch 53.59 | loss 4.94 | ppl 139.54\n", - "| epoch 5 | 1400/ 2727 batches | lr 4.07 | ms/batch 53.63 | loss 4.89 | ppl 132.99\n", - "| epoch 5 | 1600/ 2727 batches | lr 4.07 | ms/batch 53.59 | loss 4.79 | ppl 120.88\n", - "| epoch 5 | 1800/ 2727 batches | lr 4.07 | ms/batch 53.60 | loss 4.93 | ppl 138.94\n", - "| epoch 5 | 2000/ 2727 batches | lr 4.07 | ms/batch 53.61 | loss 4.92 | ppl 136.69\n", - "| epoch 5 | 2200/ 2727 batches | lr 4.07 | ms/batch 53.60 | loss 4.89 | ppl 132.41\n", - "| epoch 5 | 2400/ 2727 batches | lr 4.07 | ms/batch 53.58 | loss 4.92 | ppl 137.61\n", - "| epoch 5 | 2600/ 2727 batches | lr 4.07 | ms/batch 53.58 | loss 4.93 | ppl 138.39\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 5 | time: 153.00s | valid loss 5.29 | valid ppl 198.47\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 6 | 200/ 2727 batches | lr 3.87 | ms/batch 53.84 | loss 4.94 | ppl 139.65\n", - "| epoch 6 | 400/ 2727 batches | lr 3.87 | ms/batch 53.58 | loss 4.86 | ppl 128.66\n", - "| epoch 6 | 600/ 2727 batches | lr 3.87 | ms/batch 53.58 | loss 4.85 | ppl 127.88\n", - "| epoch 6 | 800/ 2727 batches | lr 3.87 | ms/batch 53.53 | loss 4.85 | ppl 127.75\n", - "| epoch 6 | 1000/ 2727 batches | lr 3.87 | ms/batch 53.53 | loss 4.79 | ppl 120.47\n", - "| epoch 6 | 1200/ 2727 batches | lr 3.87 | ms/batch 53.58 | loss 4.83 | ppl 125.73\n", - "| epoch 6 | 1400/ 2727 batches | lr 3.87 | ms/batch 53.57 | loss 4.79 | ppl 119.80\n", - "| epoch 6 | 1600/ 2727 batches | lr 3.87 | ms/batch 53.54 | loss 4.70 | ppl 109.73\n", - "| epoch 6 | 1800/ 2727 batches | lr 3.87 | ms/batch 53.57 | loss 4.83 | ppl 125.26\n", - "| epoch 6 | 2000/ 2727 batches | lr 3.87 | ms/batch 53.54 | loss 4.81 | ppl 123.24\n", - "| epoch 6 | 2200/ 2727 batches | lr 3.87 | ms/batch 53.55 | loss 4.78 | ppl 118.89\n", - "| epoch 6 | 2400/ 2727 batches | lr 3.87 | ms/batch 53.54 | loss 4.81 | ppl 122.48\n", - "| epoch 6 | 2600/ 2727 batches | lr 3.87 | ms/batch 53.57 | loss 4.82 | ppl 124.55\n" + "| epoch 1 | 200/ 3181 batches | lr 5.00 | ms/batch 101.32 | loss 9.07 | ppl 8713.01\n", + "| epoch 1 | 400/ 3181 batches | lr 5.00 | ms/batch 60.68 | loss 7.32 | ppl 1516.45\n", + "| epoch 1 | 600/ 3181 batches | lr 5.00 | ms/batch 60.86 | loss 6.78 | ppl 878.02\n", + "| epoch 1 | 800/ 3181 batches | lr 5.00 | ms/batch 60.93 | loss 6.44 | ppl 628.78\n", + "| epoch 1 | 1000/ 3181 batches | lr 5.00 | ms/batch 60.95 | loss 6.31 | ppl 551.05\n", + "| epoch 1 | 1200/ 3181 batches | lr 5.00 | ms/batch 60.99 | loss 6.19 | ppl 486.01\n", + "| epoch 1 | 1400/ 3181 batches | lr 5.00 | ms/batch 61.07 | loss 6.09 | ppl 441.65\n", + "| epoch 1 | 1600/ 3181 batches | lr 5.00 | ms/batch 61.08 | loss 6.07 | ppl 431.75\n", + "| epoch 1 | 1800/ 3181 batches | lr 5.00 | ms/batch 61.06 | loss 6.00 | ppl 403.14\n", + "| epoch 1 | 2000/ 3181 batches | lr 5.00 | ms/batch 61.16 | loss 5.91 | ppl 367.09\n", + "| epoch 1 | 2200/ 3181 batches | lr 5.00 | ms/batch 61.25 | loss 5.89 | ppl 359.65\n", + "| epoch 1 | 2400/ 3181 batches | lr 5.00 | ms/batch 61.37 | loss 5.86 | ppl 349.26\n", + "| epoch 1 | 2600/ 3181 batches | lr 5.00 | ms/batch 61.25 | loss 5.77 | ppl 319.99\n", + "| epoch 1 | 2800/ 3181 batches | lr 5.00 | ms/batch 61.29 | loss 5.80 | ppl 330.72\n", + "| epoch 1 | 3000/ 3181 batches | lr 5.00 | ms/batch 61.37 | loss 5.71 | ppl 303.17\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 1 | time: 213.85s | valid loss 5.73 | valid ppl 307.24\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 2 | 200/ 3181 batches | lr 4.75 | ms/batch 61.64 | loss 5.71 | ppl 302.70\n", + "| epoch 2 | 400/ 3181 batches | lr 4.75 | ms/batch 61.23 | loss 5.63 | ppl 279.76\n", + "| epoch 2 | 600/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.63 | ppl 277.70\n", + "| epoch 2 | 800/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.56 | ppl 260.48\n", + "| epoch 2 | 1000/ 3181 batches | lr 4.75 | ms/batch 61.24 | loss 5.58 | ppl 266.00\n", + "| epoch 2 | 1200/ 3181 batches | lr 4.75 | ms/batch 61.37 | loss 5.55 | ppl 257.88\n", + "| epoch 2 | 1400/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.53 | ppl 251.27\n", + "| epoch 2 | 1600/ 3181 batches | lr 4.75 | ms/batch 61.30 | loss 5.55 | ppl 255.98\n", + "| epoch 2 | 1800/ 3181 batches | lr 4.75 | ms/batch 61.29 | loss 5.53 | ppl 253.01\n", + "| epoch 2 | 2000/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.48 | ppl 238.90\n", + "| epoch 2 | 2200/ 3181 batches | lr 4.75 | ms/batch 61.35 | loss 5.46 | ppl 235.16\n", + "| epoch 2 | 2400/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.46 | ppl 235.11\n", + "| epoch 2 | 2600/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.40 | ppl 221.12\n", + "| epoch 2 | 2800/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.46 | ppl 234.30\n", + "| epoch 2 | 3000/ 3181 batches | lr 4.75 | ms/batch 61.28 | loss 5.37 | ppl 214.39\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 2 | time: 206.48s | valid loss 5.53 | valid ppl 252.22\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 3 | 200/ 3181 batches | lr 4.51 | ms/batch 61.62 | loss 5.42 | ppl 226.39\n", + "| epoch 3 | 400/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.36 | ppl 212.24\n", + "| epoch 3 | 600/ 3181 batches | lr 4.51 | ms/batch 61.31 | loss 5.34 | ppl 209.08\n", + "| epoch 3 | 800/ 3181 batches | lr 4.51 | ms/batch 61.32 | loss 5.31 | ppl 201.91\n", + "| epoch 3 | 1000/ 3181 batches | lr 4.51 | ms/batch 61.29 | loss 5.33 | ppl 207.08\n", + "| epoch 3 | 1200/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.30 | ppl 200.84\n", + "| epoch 3 | 1400/ 3181 batches | lr 4.51 | ms/batch 61.32 | loss 5.29 | ppl 198.48\n", + "| epoch 3 | 1600/ 3181 batches | lr 4.51 | ms/batch 61.30 | loss 5.31 | ppl 202.12\n", + "| epoch 3 | 1800/ 3181 batches | lr 4.51 | ms/batch 61.35 | loss 5.30 | ppl 200.79\n", + "| epoch 3 | 2000/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.26 | ppl 191.59\n", + "| epoch 3 | 2200/ 3181 batches | lr 4.51 | ms/batch 61.34 | loss 5.25 | ppl 190.89\n", + "| epoch 3 | 2400/ 3181 batches | lr 4.51 | ms/batch 61.39 | loss 5.25 | ppl 190.57\n", + "| epoch 3 | 2600/ 3181 batches | lr 4.51 | ms/batch 61.42 | loss 5.19 | ppl 180.17\n", + "| epoch 3 | 2800/ 3181 batches | lr 4.51 | ms/batch 61.38 | loss 5.26 | ppl 191.72\n", + "| epoch 3 | 3000/ 3181 batches | lr 4.51 | ms/batch 61.34 | loss 5.18 | ppl 177.08\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 3 | time: 206.57s | valid loss 5.44 | valid ppl 231.07\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 4 | 200/ 3181 batches | lr 4.29 | ms/batch 61.64 | loss 5.25 | ppl 190.59\n", + "| epoch 4 | 400/ 3181 batches | lr 4.29 | ms/batch 61.50 | loss 5.19 | ppl 178.85\n", + "| epoch 4 | 600/ 3181 batches | lr 4.29 | ms/batch 61.26 | loss 5.17 | ppl 176.66\n", + "| epoch 4 | 800/ 3181 batches | lr 4.29 | ms/batch 61.36 | loss 5.15 | ppl 172.78\n", + "| epoch 4 | 1000/ 3181 batches | lr 4.29 | ms/batch 61.32 | loss 5.19 | ppl 179.49\n", + "| epoch 4 | 1200/ 3181 batches | lr 4.29 | ms/batch 61.38 | loss 5.15 | ppl 172.65\n", + "| epoch 4 | 1400/ 3181 batches | lr 4.29 | ms/batch 61.39 | loss 5.14 | ppl 170.97\n", + "| epoch 4 | 1600/ 3181 batches | lr 4.29 | ms/batch 61.42 | loss 5.16 | ppl 174.44\n", + "| epoch 4 | 1800/ 3181 batches | lr 4.29 | ms/batch 61.40 | loss 5.16 | ppl 174.19\n", + "| epoch 4 | 2000/ 3181 batches | lr 4.29 | ms/batch 61.38 | loss 5.11 | ppl 166.50\n", + "| epoch 4 | 2200/ 3181 batches | lr 4.29 | ms/batch 61.37 | loss 5.10 | ppl 164.42\n", + "| epoch 4 | 2400/ 3181 batches | lr 4.29 | ms/batch 61.33 | loss 5.11 | ppl 165.60\n", + "| epoch 4 | 2600/ 3181 batches | lr 4.29 | ms/batch 61.41 | loss 5.06 | ppl 157.61\n", + "| epoch 4 | 2800/ 3181 batches | lr 4.29 | ms/batch 61.29 | loss 5.12 | ppl 167.69\n", + "| epoch 4 | 3000/ 3181 batches | lr 4.29 | ms/batch 61.37 | loss 5.05 | ppl 155.52\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 4 | time: 206.63s | valid loss 5.39 | valid ppl 218.67\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 5 | 200/ 3181 batches | lr 4.07 | ms/batch 61.64 | loss 5.13 | ppl 168.25\n", + "| epoch 5 | 400/ 3181 batches | lr 4.07 | ms/batch 61.30 | loss 5.06 | ppl 156.82\n", + "| epoch 5 | 600/ 3181 batches | lr 4.07 | ms/batch 61.38 | loss 5.04 | ppl 155.08\n", + "| epoch 5 | 800/ 3181 batches | lr 4.07 | ms/batch 61.33 | loss 5.03 | ppl 152.77\n", + "| epoch 5 | 1000/ 3181 batches | lr 4.07 | ms/batch 61.37 | loss 5.05 | ppl 156.69\n", + "| epoch 5 | 1200/ 3181 batches | lr 4.07 | ms/batch 61.32 | loss 5.02 | ppl 151.80\n", + "| epoch 5 | 1400/ 3181 batches | lr 4.07 | ms/batch 61.36 | loss 5.02 | ppl 151.68\n", + "| epoch 5 | 1600/ 3181 batches | lr 4.07 | ms/batch 61.37 | loss 5.03 | ppl 152.67\n", + "| epoch 5 | 1800/ 3181 batches | lr 4.07 | ms/batch 61.39 | loss 5.03 | ppl 152.77\n", + "| epoch 5 | 2000/ 3181 batches | lr 4.07 | ms/batch 61.35 | loss 4.99 | ppl 147.43\n", + "| epoch 5 | 2200/ 3181 batches | lr 4.07 | ms/batch 61.23 | loss 4.98 | ppl 145.22\n", + "| epoch 5 | 2400/ 3181 batches | lr 4.07 | ms/batch 61.33 | loss 4.99 | ppl 146.65\n", + "| epoch 5 | 2600/ 3181 batches | lr 4.07 | ms/batch 61.36 | loss 4.94 | ppl 140.06\n", + "| epoch 5 | 2800/ 3181 batches | lr 4.07 | ms/batch 61.35 | loss 5.01 | ppl 149.53\n", + "| epoch 5 | 3000/ 3181 batches | lr 4.07 | ms/batch 61.32 | loss 4.92 | ppl 136.69\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 5 | time: 206.57s | valid loss 5.41 | valid ppl 223.14\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 6 | 200/ 3181 batches | lr 3.87 | ms/batch 61.65 | loss 5.01 | ppl 149.89\n", + "| epoch 6 | 400/ 3181 batches | lr 3.87 | ms/batch 61.33 | loss 4.94 | ppl 140.08\n", + "| epoch 6 | 600/ 3181 batches | lr 3.87 | ms/batch 61.43 | loss 4.93 | ppl 137.75\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 6 | time: 152.89s | valid loss 5.30 | valid ppl 200.09\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 7 | 200/ 2727 batches | lr 3.68 | ms/batch 53.82 | loss 4.83 | ppl 125.67\n", - "| epoch 7 | 400/ 2727 batches | lr 3.68 | ms/batch 53.57 | loss 4.75 | ppl 115.28\n", - "| epoch 7 | 600/ 2727 batches | lr 3.68 | ms/batch 53.57 | loss 4.74 | ppl 114.88\n", - "| epoch 7 | 800/ 2727 batches | lr 3.68 | ms/batch 53.59 | loss 4.75 | ppl 115.36\n", - "| epoch 7 | 1000/ 2727 batches | lr 3.68 | ms/batch 53.84 | loss 4.69 | ppl 109.13\n", - "| epoch 7 | 1200/ 2727 batches | lr 3.68 | ms/batch 53.60 | loss 4.74 | ppl 114.06\n", - "| epoch 7 | 1400/ 2727 batches | lr 3.68 | ms/batch 53.53 | loss 4.69 | ppl 109.22\n", - "| epoch 7 | 1600/ 2727 batches | lr 3.68 | ms/batch 53.60 | loss 4.61 | ppl 100.03\n", - "| epoch 7 | 1800/ 2727 batches | lr 3.68 | ms/batch 53.54 | loss 4.73 | ppl 112.89\n", - "| epoch 7 | 2000/ 2727 batches | lr 3.68 | ms/batch 53.54 | loss 4.71 | ppl 111.57\n", - "| epoch 7 | 2200/ 2727 batches | lr 3.68 | ms/batch 53.56 | loss 4.68 | ppl 107.95\n", - "| epoch 7 | 2400/ 2727 batches | lr 3.68 | ms/batch 53.53 | loss 4.71 | ppl 111.61\n", - "| epoch 7 | 2600/ 2727 batches | lr 3.68 | ms/batch 53.54 | loss 4.73 | ppl 112.94\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 7 | time: 152.93s | valid loss 5.28 | valid ppl 196.84\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 8 | 200/ 2727 batches | lr 3.49 | ms/batch 53.84 | loss 4.74 | ppl 114.63\n", - "| epoch 8 | 400/ 2727 batches | lr 3.49 | ms/batch 53.54 | loss 4.67 | ppl 106.30\n", - "| epoch 8 | 600/ 2727 batches | lr 3.49 | ms/batch 53.55 | loss 4.66 | ppl 105.59\n", - "| epoch 8 | 800/ 2727 batches | lr 3.49 | ms/batch 53.53 | loss 4.66 | ppl 105.78\n", - "| epoch 8 | 1000/ 2727 batches | lr 3.49 | ms/batch 53.54 | loss 4.61 | ppl 100.14\n", - "| epoch 8 | 1200/ 2727 batches | lr 3.49 | ms/batch 53.50 | loss 4.65 | ppl 104.40\n", - "| epoch 8 | 1400/ 2727 batches | lr 3.49 | ms/batch 53.54 | loss 4.61 | ppl 100.21\n", - "| epoch 8 | 1600/ 2727 batches | lr 3.49 | ms/batch 53.53 | loss 4.52 | ppl 91.89\n", - "| epoch 8 | 1800/ 2727 batches | lr 3.49 | ms/batch 53.54 | loss 4.64 | ppl 103.21\n", - "| epoch 8 | 2000/ 2727 batches | lr 3.49 | ms/batch 53.53 | loss 4.63 | ppl 102.02\n", - "| epoch 8 | 2200/ 2727 batches | lr 3.49 | ms/batch 53.54 | loss 4.60 | ppl 99.14\n", - "| epoch 8 | 2400/ 2727 batches | lr 3.49 | ms/batch 53.68 | loss 4.63 | ppl 102.63\n", - "| epoch 8 | 2600/ 2727 batches | lr 3.49 | ms/batch 53.52 | loss 4.64 | ppl 103.81\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 8 | time: 152.84s | valid loss 5.30 | valid ppl 200.97\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 9 | 200/ 2727 batches | lr 3.32 | ms/batch 53.78 | loss 4.66 | ppl 105.35\n", - "| epoch 9 | 400/ 2727 batches | lr 3.32 | ms/batch 53.55 | loss 4.57 | ppl 96.89\n", - "| epoch 9 | 600/ 2727 batches | lr 3.32 | ms/batch 53.55 | loss 4.57 | ppl 96.68\n", - "| epoch 9 | 800/ 2727 batches | lr 3.32 | ms/batch 53.52 | loss 4.58 | ppl 97.08\n", - "| epoch 9 | 1000/ 2727 batches | lr 3.32 | ms/batch 53.54 | loss 4.53 | ppl 92.32\n", - "| epoch 9 | 1200/ 2727 batches | lr 3.32 | ms/batch 53.54 | loss 4.57 | ppl 96.21\n", - "| epoch 9 | 1400/ 2727 batches | lr 3.32 | ms/batch 53.55 | loss 4.52 | ppl 91.97\n", - "| epoch 9 | 1600/ 2727 batches | lr 3.32 | ms/batch 53.54 | loss 4.44 | ppl 84.91\n", - "| epoch 9 | 1800/ 2727 batches | lr 3.32 | ms/batch 53.52 | loss 4.55 | ppl 94.87\n", - "| epoch 9 | 2000/ 2727 batches | lr 3.32 | ms/batch 53.56 | loss 4.55 | ppl 94.49\n", - "| epoch 9 | 2200/ 2727 batches | lr 3.32 | ms/batch 53.54 | loss 4.52 | ppl 91.54\n", - "| epoch 9 | 2400/ 2727 batches | lr 3.32 | ms/batch 53.56 | loss 4.55 | ppl 94.55\n", - "| epoch 9 | 2600/ 2727 batches | lr 3.32 | ms/batch 53.55 | loss 4.55 | ppl 95.05\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 9 | time: 152.85s | valid loss 5.29 | valid ppl 198.51\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 10 | 200/ 2727 batches | lr 3.15 | ms/batch 53.81 | loss 4.58 | ppl 97.48\n", - "| epoch 10 | 400/ 2727 batches | lr 3.15 | ms/batch 53.56 | loss 4.50 | ppl 90.24\n", - "| epoch 10 | 600/ 2727 batches | lr 3.15 | ms/batch 53.55 | loss 4.50 | ppl 90.01\n", - "| epoch 10 | 800/ 2727 batches | lr 3.15 | ms/batch 53.55 | loss 4.49 | ppl 89.46\n", - "| epoch 10 | 1000/ 2727 batches | lr 3.15 | ms/batch 53.54 | loss 4.45 | ppl 85.29\n", - "| epoch 10 | 1200/ 2727 batches | lr 3.15 | ms/batch 53.55 | loss 4.49 | ppl 89.10\n", - "| epoch 10 | 1400/ 2727 batches | lr 3.15 | ms/batch 53.56 | loss 4.45 | ppl 85.22\n", - "| epoch 10 | 1600/ 2727 batches | lr 3.15 | ms/batch 53.52 | loss 4.38 | ppl 79.46\n", - "| epoch 10 | 1800/ 2727 batches | lr 3.15 | ms/batch 53.51 | loss 4.47 | ppl 87.24\n", - "| epoch 10 | 2000/ 2727 batches | lr 3.15 | ms/batch 53.52 | loss 4.47 | ppl 87.13\n", - "| epoch 10 | 2200/ 2727 batches | lr 3.15 | ms/batch 53.53 | loss 4.44 | ppl 85.14\n", - "| epoch 10 | 2400/ 2727 batches | lr 3.15 | ms/batch 53.53 | loss 4.47 | ppl 87.55\n", - "| epoch 10 | 2600/ 2727 batches | lr 3.15 | ms/batch 53.53 | loss 4.47 | ppl 87.64\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 10 | time: 152.82s | valid loss 5.32 | valid ppl 203.65\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 11 | 200/ 2727 batches | lr 2.99 | ms/batch 53.82 | loss 4.50 | ppl 90.42\n", - "| epoch 11 | 400/ 2727 batches | lr 2.99 | ms/batch 53.53 | loss 4.43 | ppl 83.56\n", - "| epoch 11 | 600/ 2727 batches | lr 2.99 | ms/batch 53.54 | loss 4.43 | ppl 83.85\n", - "| epoch 11 | 800/ 2727 batches | lr 2.99 | ms/batch 53.54 | loss 4.42 | ppl 83.09\n", - "| epoch 11 | 1000/ 2727 batches | lr 2.99 | ms/batch 53.54 | loss 4.38 | ppl 79.94\n", - "| epoch 11 | 1200/ 2727 batches | lr 2.99 | ms/batch 53.52 | loss 4.42 | ppl 83.14\n", - "| epoch 11 | 1400/ 2727 batches | lr 2.99 | ms/batch 53.54 | loss 4.37 | ppl 79.22\n", - "| epoch 11 | 1600/ 2727 batches | lr 2.99 | ms/batch 53.52 | loss 4.31 | ppl 74.26\n", - "| epoch 11 | 1800/ 2727 batches | lr 2.99 | ms/batch 53.57 | loss 4.40 | ppl 81.43\n", - "| epoch 11 | 2000/ 2727 batches | lr 2.99 | ms/batch 53.56 | loss 4.40 | ppl 81.52\n", - "| epoch 11 | 2200/ 2727 batches | lr 2.99 | ms/batch 53.54 | loss 4.38 | ppl 79.78\n", - "| epoch 11 | 2400/ 2727 batches | lr 2.99 | ms/batch 53.51 | loss 4.40 | ppl 81.41\n", - "| epoch 11 | 2600/ 2727 batches | lr 2.99 | ms/batch 53.53 | loss 4.41 | ppl 82.35\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 11 | time: 152.82s | valid loss 5.32 | valid ppl 204.69\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 12 | 200/ 2727 batches | lr 2.84 | ms/batch 53.79 | loss 4.43 | ppl 84.16\n", - "| epoch 12 | 400/ 2727 batches | lr 2.84 | ms/batch 53.53 | loss 4.36 | ppl 77.93\n", - "| epoch 12 | 600/ 2727 batches | lr 2.84 | ms/batch 53.55 | loss 4.36 | ppl 78.43\n", - "| epoch 12 | 800/ 2727 batches | lr 2.84 | ms/batch 53.53 | loss 4.36 | ppl 78.11\n", - "| epoch 12 | 1000/ 2727 batches | lr 2.84 | ms/batch 53.53 | loss 4.30 | ppl 73.88\n", - "| epoch 12 | 1200/ 2727 batches | lr 2.84 | ms/batch 53.54 | loss 4.35 | ppl 77.59\n", - "| epoch 12 | 1400/ 2727 batches | lr 2.84 | ms/batch 53.54 | loss 4.31 | ppl 74.45\n", - "| epoch 12 | 1600/ 2727 batches | lr 2.84 | ms/batch 53.49 | loss 4.24 | ppl 69.74\n", - "| epoch 12 | 1800/ 2727 batches | lr 2.84 | ms/batch 53.57 | loss 4.33 | ppl 76.30\n", - "| epoch 12 | 2000/ 2727 batches | lr 2.84 | ms/batch 53.51 | loss 4.33 | ppl 75.93\n" + "| epoch 6 | 800/ 3181 batches | lr 3.87 | ms/batch 61.31 | loss 4.91 | ppl 135.89\n", + "| epoch 6 | 1000/ 3181 batches | lr 3.87 | ms/batch 61.29 | loss 4.95 | ppl 141.06\n", + "| epoch 6 | 1200/ 3181 batches | lr 3.87 | ms/batch 61.38 | loss 4.90 | ppl 134.49\n", + "| epoch 6 | 1400/ 3181 batches | lr 3.87 | ms/batch 61.33 | loss 4.91 | ppl 135.28\n", + "| epoch 6 | 1600/ 3181 batches | lr 3.87 | ms/batch 61.37 | loss 4.91 | ppl 136.26\n", + "| epoch 6 | 1800/ 3181 batches | lr 3.87 | ms/batch 61.36 | loss 4.93 | ppl 137.81\n", + "| epoch 6 | 2000/ 3181 batches | lr 3.87 | ms/batch 61.40 | loss 4.88 | ppl 131.80\n", + "| epoch 6 | 2200/ 3181 batches | lr 3.87 | ms/batch 61.43 | loss 4.87 | ppl 130.59\n", + "| epoch 6 | 2400/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.87 | ppl 130.95\n", + "| epoch 6 | 2600/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.83 | ppl 125.04\n", + "| epoch 6 | 2800/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.91 | ppl 135.49\n", + "| epoch 6 | 3000/ 3181 batches | lr 3.87 | ms/batch 61.29 | loss 4.81 | ppl 122.53\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 6 | time: 206.60s | valid loss 5.37 | valid ppl 214.67\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 7 | 200/ 3181 batches | lr 3.68 | ms/batch 61.66 | loss 4.91 | ppl 135.30\n", + "| epoch 7 | 400/ 3181 batches | lr 3.68 | ms/batch 61.23 | loss 4.83 | ppl 125.76\n", + "| epoch 7 | 600/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.83 | ppl 125.52\n", + "| epoch 7 | 800/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.82 | ppl 123.59\n", + "| epoch 7 | 1000/ 3181 batches | lr 3.68 | ms/batch 61.35 | loss 4.85 | ppl 127.65\n", + "| epoch 7 | 1200/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.81 | ppl 122.34\n", + "| epoch 7 | 1400/ 3181 batches | lr 3.68 | ms/batch 61.38 | loss 4.81 | ppl 123.22\n", + "| epoch 7 | 1600/ 3181 batches | lr 3.68 | ms/batch 61.34 | loss 4.82 | ppl 123.82\n", + "| epoch 7 | 1800/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.83 | ppl 125.22\n", + "| epoch 7 | 2000/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.79 | ppl 119.76\n", + "| epoch 7 | 2200/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.78 | ppl 118.99\n", + "| epoch 7 | 2400/ 3181 batches | lr 3.68 | ms/batch 61.33 | loss 4.78 | ppl 118.59\n", + "| epoch 7 | 2600/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.73 | ppl 113.60\n", + "| epoch 7 | 2800/ 3181 batches | lr 3.68 | ms/batch 61.33 | loss 4.80 | ppl 122.09\n", + "| epoch 7 | 3000/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.71 | ppl 111.19\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 7 | time: 206.61s | valid loss 5.35 | valid ppl 210.55\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 8 | 200/ 3181 batches | lr 3.49 | ms/batch 61.69 | loss 4.81 | ppl 122.20\n", + "| epoch 8 | 400/ 3181 batches | lr 3.49 | ms/batch 61.28 | loss 4.74 | ppl 114.11\n", + "| epoch 8 | 600/ 3181 batches | lr 3.49 | ms/batch 61.41 | loss 4.73 | ppl 113.82\n", + "| epoch 8 | 800/ 3181 batches | lr 3.49 | ms/batch 61.35 | loss 4.73 | ppl 113.04\n", + "| epoch 8 | 1000/ 3181 batches | lr 3.49 | ms/batch 61.46 | loss 4.75 | ppl 115.84\n", + "| epoch 8 | 1200/ 3181 batches | lr 3.49 | ms/batch 61.43 | loss 4.71 | ppl 111.58\n", + "| epoch 8 | 1400/ 3181 batches | lr 3.49 | ms/batch 61.37 | loss 4.72 | ppl 111.84\n", + "| epoch 8 | 1600/ 3181 batches | lr 3.49 | ms/batch 61.39 | loss 4.72 | ppl 112.52\n", + "| epoch 8 | 1800/ 3181 batches | lr 3.49 | ms/batch 61.44 | loss 4.74 | ppl 114.44\n", + "| epoch 8 | 2000/ 3181 batches | lr 3.49 | ms/batch 61.37 | loss 4.70 | ppl 109.63\n", + "| epoch 8 | 2200/ 3181 batches | lr 3.49 | ms/batch 61.31 | loss 4.68 | ppl 108.29\n", + "| epoch 8 | 2400/ 3181 batches | lr 3.49 | ms/batch 61.28 | loss 4.69 | ppl 108.78\n", + "| epoch 8 | 2600/ 3181 batches | lr 3.49 | ms/batch 61.30 | loss 4.64 | ppl 103.90\n", + "| epoch 8 | 2800/ 3181 batches | lr 3.49 | ms/batch 61.33 | loss 4.72 | ppl 111.83\n", + "| epoch 8 | 3000/ 3181 batches | lr 3.49 | ms/batch 61.33 | loss 4.62 | ppl 101.24\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 8 | time: 206.60s | valid loss 5.34 | valid ppl 208.08\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 9 | 200/ 3181 batches | lr 3.32 | ms/batch 61.64 | loss 4.72 | ppl 111.95\n", + "| epoch 9 | 400/ 3181 batches | lr 3.32 | ms/batch 61.40 | loss 4.65 | ppl 104.38\n", + "| epoch 9 | 600/ 3181 batches | lr 3.32 | ms/batch 61.33 | loss 4.64 | ppl 103.97\n", + "| epoch 9 | 800/ 3181 batches | lr 3.32 | ms/batch 61.32 | loss 4.64 | ppl 103.60\n", + "| epoch 9 | 1000/ 3181 batches | lr 3.32 | ms/batch 61.40 | loss 4.68 | ppl 107.40\n", + "| epoch 9 | 1200/ 3181 batches | lr 3.32 | ms/batch 61.39 | loss 4.62 | ppl 101.89\n", + "| epoch 9 | 1400/ 3181 batches | lr 3.32 | ms/batch 61.33 | loss 4.64 | ppl 103.60\n", + "| epoch 9 | 1600/ 3181 batches | lr 3.32 | ms/batch 61.30 | loss 4.64 | ppl 103.54\n", + "| epoch 9 | 1800/ 3181 batches | lr 3.32 | ms/batch 61.31 | loss 4.66 | ppl 105.35\n", + "| epoch 9 | 2000/ 3181 batches | lr 3.32 | ms/batch 61.36 | loss 4.62 | ppl 101.24\n", + "| epoch 9 | 2200/ 3181 batches | lr 3.32 | ms/batch 61.28 | loss 4.60 | ppl 99.91\n", + "| epoch 9 | 2400/ 3181 batches | lr 3.32 | ms/batch 61.34 | loss 4.61 | ppl 100.17\n", + "| epoch 9 | 2600/ 3181 batches | lr 3.32 | ms/batch 61.36 | loss 4.56 | ppl 95.58\n", + "| epoch 9 | 2800/ 3181 batches | lr 3.32 | ms/batch 61.43 | loss 4.63 | ppl 102.81\n", + "| epoch 9 | 3000/ 3181 batches | lr 3.32 | ms/batch 61.38 | loss 4.54 | ppl 93.66\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 9 | time: 206.58s | valid loss 5.35 | valid ppl 209.83\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 10 | 200/ 3181 batches | lr 3.15 | ms/batch 61.64 | loss 4.64 | ppl 103.90\n", + "| epoch 10 | 400/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.57 | ppl 96.88\n", + "| epoch 10 | 600/ 3181 batches | lr 3.15 | ms/batch 61.35 | loss 4.56 | ppl 95.86\n", + "| epoch 10 | 800/ 3181 batches | lr 3.15 | ms/batch 61.32 | loss 4.56 | ppl 95.84\n", + "| epoch 10 | 1000/ 3181 batches | lr 3.15 | ms/batch 61.33 | loss 4.59 | ppl 98.74\n", + "| epoch 10 | 1200/ 3181 batches | lr 3.15 | ms/batch 61.32 | loss 4.55 | ppl 94.35\n", + "| epoch 10 | 1400/ 3181 batches | lr 3.15 | ms/batch 61.27 | loss 4.56 | ppl 95.77\n", + "| epoch 10 | 1600/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.55 | ppl 94.76\n", + "| epoch 10 | 1800/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.57 | ppl 96.99\n", + "| epoch 10 | 2000/ 3181 batches | lr 3.15 | ms/batch 61.34 | loss 4.54 | ppl 93.41\n", + "| epoch 10 | 2200/ 3181 batches | lr 3.15 | ms/batch 61.29 | loss 4.53 | ppl 92.30\n", + "| epoch 10 | 2400/ 3181 batches | lr 3.15 | ms/batch 61.36 | loss 4.53 | ppl 92.36\n", + "| epoch 10 | 2600/ 3181 batches | lr 3.15 | ms/batch 61.34 | loss 4.48 | ppl 88.41\n", + "| epoch 10 | 2800/ 3181 batches | lr 3.15 | ms/batch 61.35 | loss 4.56 | ppl 95.78\n", + "| epoch 10 | 3000/ 3181 batches | lr 3.15 | ms/batch 61.33 | loss 4.46 | ppl 86.56\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 10 | time: 206.54s | valid loss 5.38 | valid ppl 216.73\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 11 | 200/ 3181 batches | lr 2.99 | ms/batch 61.65 | loss 4.57 | ppl 96.07\n", + "| epoch 11 | 400/ 3181 batches | lr 2.99 | ms/batch 61.42 | loss 4.50 | ppl 89.75\n", + "| epoch 11 | 600/ 3181 batches | lr 2.99 | ms/batch 61.39 | loss 4.49 | ppl 88.98\n", + "| epoch 11 | 800/ 3181 batches | lr 2.99 | ms/batch 61.33 | loss 4.49 | ppl 89.43\n", + "| epoch 11 | 1000/ 3181 batches | lr 2.99 | ms/batch 61.36 | loss 4.52 | ppl 92.09\n", + "| epoch 11 | 1200/ 3181 batches | lr 2.99 | ms/batch 61.45 | loss 4.47 | ppl 87.68\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 12 | 2200/ 2727 batches | lr 2.84 | ms/batch 53.53 | loss 4.31 | ppl 74.43\n", - "| epoch 12 | 2400/ 2727 batches | lr 2.84 | ms/batch 53.52 | loss 4.33 | ppl 76.31\n", - "| epoch 12 | 2600/ 2727 batches | lr 2.84 | ms/batch 53.52 | loss 4.35 | ppl 77.29\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 12 | time: 152.80s | valid loss 5.38 | valid ppl 215.98\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 13 | 200/ 2727 batches | lr 2.70 | ms/batch 53.81 | loss 4.37 | ppl 79.06\n", - "| epoch 13 | 400/ 2727 batches | lr 2.70 | ms/batch 53.56 | loss 4.29 | ppl 72.96\n", - "| epoch 13 | 600/ 2727 batches | lr 2.70 | ms/batch 53.56 | loss 4.30 | ppl 73.51\n", - "| epoch 13 | 800/ 2727 batches | lr 2.70 | ms/batch 53.55 | loss 4.29 | ppl 72.86\n", - "| epoch 13 | 1000/ 2727 batches | lr 2.70 | ms/batch 53.54 | loss 4.25 | ppl 69.97\n", - "| epoch 13 | 1200/ 2727 batches | lr 2.70 | ms/batch 53.53 | loss 4.29 | ppl 73.18\n", - "| epoch 13 | 1400/ 2727 batches | lr 2.70 | ms/batch 53.53 | loss 4.24 | ppl 69.60\n", - "| epoch 13 | 1600/ 2727 batches | lr 2.70 | ms/batch 53.53 | loss 4.18 | ppl 65.65\n", - "| epoch 13 | 1800/ 2727 batches | lr 2.70 | ms/batch 53.51 | loss 4.27 | ppl 71.60\n", - "| epoch 13 | 2000/ 2727 batches | lr 2.70 | ms/batch 53.51 | loss 4.27 | ppl 71.30\n", - "| epoch 13 | 2200/ 2727 batches | lr 2.70 | ms/batch 53.55 | loss 4.25 | ppl 70.30\n", - "| epoch 13 | 2400/ 2727 batches | lr 2.70 | ms/batch 53.52 | loss 4.28 | ppl 71.95\n", - "| epoch 13 | 2600/ 2727 batches | lr 2.70 | ms/batch 53.54 | loss 4.28 | ppl 72.54\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 13 | time: 152.81s | valid loss 5.39 | valid ppl 220.01\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 14 | 200/ 2727 batches | lr 2.57 | ms/batch 53.82 | loss 4.31 | ppl 74.32\n", - "| epoch 14 | 400/ 2727 batches | lr 2.57 | ms/batch 53.53 | loss 4.23 | ppl 69.03\n", - "| epoch 14 | 600/ 2727 batches | lr 2.57 | ms/batch 53.51 | loss 4.24 | ppl 69.41\n", - "| epoch 14 | 800/ 2727 batches | lr 2.57 | ms/batch 53.54 | loss 4.24 | ppl 69.43\n", - "| epoch 14 | 1000/ 2727 batches | lr 2.57 | ms/batch 53.55 | loss 4.19 | ppl 65.95\n", - "| epoch 14 | 1200/ 2727 batches | lr 2.57 | ms/batch 53.56 | loss 4.23 | ppl 69.04\n", - "| epoch 14 | 1400/ 2727 batches | lr 2.57 | ms/batch 53.54 | loss 4.19 | ppl 65.93\n", - "| epoch 14 | 1600/ 2727 batches | lr 2.57 | ms/batch 53.56 | loss 4.14 | ppl 62.51\n", - "| epoch 14 | 1800/ 2727 batches | lr 2.57 | ms/batch 53.51 | loss 4.21 | ppl 67.67\n", - "| epoch 14 | 2000/ 2727 batches | lr 2.57 | ms/batch 53.57 | loss 4.20 | ppl 66.94\n", - "| epoch 14 | 2200/ 2727 batches | lr 2.57 | ms/batch 53.55 | loss 4.19 | ppl 66.28\n", - "| epoch 14 | 2400/ 2727 batches | lr 2.57 | ms/batch 53.56 | loss 4.22 | ppl 67.79\n", - "| epoch 14 | 2600/ 2727 batches | lr 2.57 | ms/batch 53.52 | loss 4.22 | ppl 68.15\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 14 | time: 152.84s | valid loss 5.39 | valid ppl 220.22\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 15 | 200/ 2727 batches | lr 2.44 | ms/batch 53.83 | loss 4.26 | ppl 70.62\n", - "| epoch 15 | 400/ 2727 batches | lr 2.44 | ms/batch 53.56 | loss 4.17 | ppl 64.96\n", - "| epoch 15 | 600/ 2727 batches | lr 2.44 | ms/batch 53.52 | loss 4.18 | ppl 65.63\n", - "| epoch 15 | 800/ 2727 batches | lr 2.44 | ms/batch 53.55 | loss 4.18 | ppl 65.28\n", - "| epoch 15 | 1000/ 2727 batches | lr 2.44 | ms/batch 53.59 | loss 4.14 | ppl 62.56\n", - "| epoch 15 | 1200/ 2727 batches | lr 2.44 | ms/batch 53.55 | loss 4.17 | ppl 64.85\n", - "| epoch 15 | 1400/ 2727 batches | lr 2.44 | ms/batch 53.58 | loss 4.13 | ppl 62.30\n", - "| epoch 15 | 1600/ 2727 batches | lr 2.44 | ms/batch 53.56 | loss 4.08 | ppl 59.29\n", - "| epoch 15 | 1800/ 2727 batches | lr 2.44 | ms/batch 53.56 | loss 4.15 | ppl 63.62\n", - "| epoch 15 | 2000/ 2727 batches | lr 2.44 | ms/batch 53.55 | loss 4.15 | ppl 63.55\n", - "| epoch 15 | 2200/ 2727 batches | lr 2.44 | ms/batch 53.52 | loss 4.14 | ppl 62.85\n", - "| epoch 15 | 2400/ 2727 batches | lr 2.44 | ms/batch 53.55 | loss 4.16 | ppl 64.08\n", - "| epoch 15 | 2600/ 2727 batches | lr 2.44 | ms/batch 53.57 | loss 4.17 | ppl 64.53\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 15 | time: 152.87s | valid loss 5.46 | valid ppl 235.52\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 16 | 200/ 2727 batches | lr 2.32 | ms/batch 53.84 | loss 4.20 | ppl 66.56\n", - "| epoch 16 | 400/ 2727 batches | lr 2.32 | ms/batch 53.54 | loss 4.13 | ppl 61.89\n", - "| epoch 16 | 600/ 2727 batches | lr 2.32 | ms/batch 53.56 | loss 4.13 | ppl 62.00\n", - "| epoch 16 | 800/ 2727 batches | lr 2.32 | ms/batch 53.57 | loss 4.12 | ppl 61.75\n", - "| epoch 16 | 1000/ 2727 batches | lr 2.32 | ms/batch 53.55 | loss 4.08 | ppl 59.16\n", - "| epoch 16 | 1200/ 2727 batches | lr 2.32 | ms/batch 53.56 | loss 4.12 | ppl 61.80\n", - "| epoch 16 | 1400/ 2727 batches | lr 2.32 | ms/batch 53.57 | loss 4.08 | ppl 58.94\n", - "| epoch 16 | 1600/ 2727 batches | lr 2.32 | ms/batch 53.54 | loss 4.03 | ppl 56.54\n", - "| epoch 16 | 1800/ 2727 batches | lr 2.32 | ms/batch 53.52 | loss 4.10 | ppl 60.59\n", - "| epoch 16 | 2000/ 2727 batches | lr 2.32 | ms/batch 53.58 | loss 4.10 | ppl 60.13\n", - "| epoch 16 | 2200/ 2727 batches | lr 2.32 | ms/batch 53.56 | loss 4.09 | ppl 59.79\n", - "| epoch 16 | 2400/ 2727 batches | lr 2.32 | ms/batch 53.60 | loss 4.11 | ppl 60.83\n", - "| epoch 16 | 2600/ 2727 batches | lr 2.32 | ms/batch 53.57 | loss 4.11 | ppl 61.11\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 16 | time: 152.88s | valid loss 5.41 | valid ppl 224.68\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 17 | 200/ 2727 batches | lr 2.20 | ms/batch 53.78 | loss 4.15 | ppl 63.27\n", - "| epoch 17 | 400/ 2727 batches | lr 2.20 | ms/batch 53.57 | loss 4.07 | ppl 58.62\n", - "| epoch 17 | 600/ 2727 batches | lr 2.20 | ms/batch 53.49 | loss 4.08 | ppl 59.06\n", - "| epoch 17 | 800/ 2727 batches | lr 2.20 | ms/batch 53.57 | loss 4.08 | ppl 58.99\n", - "| epoch 17 | 1000/ 2727 batches | lr 2.20 | ms/batch 53.57 | loss 4.03 | ppl 56.47\n", - "| epoch 17 | 1200/ 2727 batches | lr 2.20 | ms/batch 53.53 | loss 4.07 | ppl 58.70\n", - "| epoch 17 | 1400/ 2727 batches | lr 2.20 | ms/batch 53.51 | loss 4.03 | ppl 56.07\n", - "| epoch 17 | 1600/ 2727 batches | lr 2.20 | ms/batch 53.56 | loss 3.99 | ppl 53.92\n", - "| epoch 17 | 1800/ 2727 batches | lr 2.20 | ms/batch 53.57 | loss 4.05 | ppl 57.44\n", - "| epoch 17 | 2000/ 2727 batches | lr 2.20 | ms/batch 53.53 | loss 4.05 | ppl 57.22\n", - "| epoch 17 | 2200/ 2727 batches | lr 2.20 | ms/batch 53.55 | loss 4.05 | ppl 57.23\n", - "| epoch 17 | 2400/ 2727 batches | lr 2.20 | ms/batch 53.55 | loss 4.05 | ppl 57.45\n", - "| epoch 17 | 2600/ 2727 batches | lr 2.20 | ms/batch 53.59 | loss 4.06 | ppl 58.19\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 17 | time: 152.89s | valid loss 5.46 | valid ppl 234.56\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 18 | 200/ 2727 batches | lr 2.09 | ms/batch 53.84 | loss 4.10 | ppl 60.40\n", - "| epoch 18 | 400/ 2727 batches | lr 2.09 | ms/batch 53.60 | loss 4.03 | ppl 56.36\n", - "| epoch 18 | 600/ 2727 batches | lr 2.09 | ms/batch 53.55 | loss 4.03 | ppl 56.52\n", - "| epoch 18 | 800/ 2727 batches | lr 2.09 | ms/batch 53.51 | loss 4.03 | ppl 56.19\n", - "| epoch 18 | 1000/ 2727 batches | lr 2.09 | ms/batch 53.55 | loss 3.99 | ppl 54.11\n", - "| epoch 18 | 1200/ 2727 batches | lr 2.09 | ms/batch 53.52 | loss 4.02 | ppl 55.88\n", - "| epoch 18 | 1400/ 2727 batches | lr 2.09 | ms/batch 53.55 | loss 3.98 | ppl 53.29\n" + "| epoch 11 | 1400/ 3181 batches | lr 2.99 | ms/batch 61.38 | loss 4.49 | ppl 89.02\n", + "| epoch 11 | 1600/ 3181 batches | lr 2.99 | ms/batch 61.41 | loss 4.49 | ppl 89.15\n", + "| epoch 11 | 1800/ 3181 batches | lr 2.99 | ms/batch 61.33 | loss 4.50 | ppl 90.22\n", + "| epoch 11 | 2000/ 3181 batches | lr 2.99 | ms/batch 61.30 | loss 4.46 | ppl 86.81\n", + "| epoch 11 | 2200/ 3181 batches | lr 2.99 | ms/batch 61.35 | loss 4.45 | ppl 85.80\n", + "| epoch 11 | 2400/ 3181 batches | lr 2.99 | ms/batch 61.35 | loss 4.46 | ppl 86.48\n", + "| epoch 11 | 2600/ 3181 batches | lr 2.99 | ms/batch 61.30 | loss 4.41 | ppl 82.18\n", + "| epoch 11 | 2800/ 3181 batches | lr 2.99 | ms/batch 61.40 | loss 4.48 | ppl 88.42\n", + "| epoch 11 | 3000/ 3181 batches | lr 2.99 | ms/batch 61.42 | loss 4.39 | ppl 80.87\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 11 | time: 206.64s | valid loss 5.39 | valid ppl 219.73\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 12 | 200/ 3181 batches | lr 2.84 | ms/batch 61.75 | loss 4.50 | ppl 89.97\n", + "| epoch 12 | 400/ 3181 batches | lr 2.84 | ms/batch 61.42 | loss 4.43 | ppl 84.04\n", + "| epoch 12 | 600/ 3181 batches | lr 2.84 | ms/batch 61.45 | loss 4.42 | ppl 83.14\n", + "| epoch 12 | 800/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.42 | ppl 83.42\n", + "| epoch 12 | 1000/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.46 | ppl 86.36\n", + "| epoch 12 | 1200/ 3181 batches | lr 2.84 | ms/batch 61.37 | loss 4.41 | ppl 82.13\n", + "| epoch 12 | 1400/ 3181 batches | lr 2.84 | ms/batch 61.32 | loss 4.42 | ppl 83.46\n", + "| epoch 12 | 1600/ 3181 batches | lr 2.84 | ms/batch 61.38 | loss 4.42 | ppl 82.96\n", + "| epoch 12 | 1800/ 3181 batches | lr 2.84 | ms/batch 61.38 | loss 4.44 | ppl 84.42\n", + "| epoch 12 | 2000/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.40 | ppl 81.54\n", + "| epoch 12 | 2200/ 3181 batches | lr 2.84 | ms/batch 61.36 | loss 4.39 | ppl 80.50\n", + "| epoch 12 | 2400/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.39 | ppl 80.92\n", + "| epoch 12 | 2600/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.35 | ppl 77.30\n", + "| epoch 12 | 2800/ 3181 batches | lr 2.84 | ms/batch 61.39 | loss 4.42 | ppl 83.09\n", + "| epoch 12 | 3000/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.33 | ppl 75.78\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 12 | time: 206.67s | valid loss 5.42 | valid ppl 224.91\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 13 | 200/ 3181 batches | lr 2.70 | ms/batch 61.68 | loss 4.43 | ppl 83.99\n", + "| epoch 13 | 400/ 3181 batches | lr 2.70 | ms/batch 61.34 | loss 4.36 | ppl 78.48\n", + "| epoch 13 | 600/ 3181 batches | lr 2.70 | ms/batch 61.33 | loss 4.35 | ppl 77.76\n", + "| epoch 13 | 800/ 3181 batches | lr 2.70 | ms/batch 61.31 | loss 4.37 | ppl 78.88\n", + "| epoch 13 | 1000/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.39 | ppl 80.64\n", + "| epoch 13 | 1200/ 3181 batches | lr 2.70 | ms/batch 61.37 | loss 4.34 | ppl 76.95\n", + "| epoch 13 | 1400/ 3181 batches | lr 2.70 | ms/batch 61.41 | loss 4.36 | ppl 78.49\n", + "| epoch 13 | 1600/ 3181 batches | lr 2.70 | ms/batch 61.35 | loss 4.36 | ppl 77.93\n", + "| epoch 13 | 1800/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.37 | ppl 79.08\n", + "| epoch 13 | 2000/ 3181 batches | lr 2.70 | ms/batch 61.34 | loss 4.34 | ppl 76.68\n", + "| epoch 13 | 2200/ 3181 batches | lr 2.70 | ms/batch 61.37 | loss 4.32 | ppl 75.17\n", + "| epoch 13 | 2400/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.33 | ppl 75.87\n", + "| epoch 13 | 2600/ 3181 batches | lr 2.70 | ms/batch 61.29 | loss 4.28 | ppl 72.20\n", + "| epoch 13 | 2800/ 3181 batches | lr 2.70 | ms/batch 61.27 | loss 4.36 | ppl 78.01\n", + "| epoch 13 | 3000/ 3181 batches | lr 2.70 | ms/batch 61.36 | loss 4.26 | ppl 70.91\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 13 | time: 206.58s | valid loss 5.42 | valid ppl 225.31\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 14 | 200/ 3181 batches | lr 2.57 | ms/batch 61.59 | loss 4.37 | ppl 79.39\n", + "| epoch 14 | 400/ 3181 batches | lr 2.57 | ms/batch 61.37 | loss 4.30 | ppl 73.94\n", + "| epoch 14 | 600/ 3181 batches | lr 2.57 | ms/batch 61.32 | loss 4.30 | ppl 73.50\n", + "| epoch 14 | 800/ 3181 batches | lr 2.57 | ms/batch 61.36 | loss 4.31 | ppl 74.12\n", + "| epoch 14 | 1000/ 3181 batches | lr 2.57 | ms/batch 61.41 | loss 4.33 | ppl 75.86\n", + "| epoch 14 | 1200/ 3181 batches | lr 2.57 | ms/batch 61.34 | loss 4.29 | ppl 72.64\n", + "| epoch 14 | 1400/ 3181 batches | lr 2.57 | ms/batch 61.39 | loss 4.31 | ppl 74.29\n", + "| epoch 14 | 1600/ 3181 batches | lr 2.57 | ms/batch 61.31 | loss 4.29 | ppl 73.17\n", + "| epoch 14 | 1800/ 3181 batches | lr 2.57 | ms/batch 61.41 | loss 4.31 | ppl 74.28\n", + "| epoch 14 | 2000/ 3181 batches | lr 2.57 | ms/batch 61.34 | loss 4.28 | ppl 71.97\n", + "| epoch 14 | 2200/ 3181 batches | lr 2.57 | ms/batch 61.44 | loss 4.26 | ppl 71.13\n", + "| epoch 14 | 2400/ 3181 batches | lr 2.57 | ms/batch 61.32 | loss 4.27 | ppl 71.61\n", + "| epoch 14 | 2600/ 3181 batches | lr 2.57 | ms/batch 61.42 | loss 4.22 | ppl 67.93\n", + "| epoch 14 | 2800/ 3181 batches | lr 2.57 | ms/batch 61.42 | loss 4.30 | ppl 73.68\n", + "| epoch 14 | 3000/ 3181 batches | lr 2.57 | ms/batch 61.36 | loss 4.21 | ppl 67.08\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 14 | time: 206.63s | valid loss 5.47 | valid ppl 236.36\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 15 | 200/ 3181 batches | lr 2.44 | ms/batch 61.66 | loss 4.32 | ppl 75.20\n", + "| epoch 15 | 400/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.25 | ppl 69.78\n", + "| epoch 15 | 600/ 3181 batches | lr 2.44 | ms/batch 61.30 | loss 4.23 | ppl 68.98\n", + "| epoch 15 | 800/ 3181 batches | lr 2.44 | ms/batch 61.34 | loss 4.25 | ppl 70.20\n", + "| epoch 15 | 1000/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.28 | ppl 71.96\n", + "| epoch 15 | 1200/ 3181 batches | lr 2.44 | ms/batch 61.29 | loss 4.23 | ppl 68.62\n", + "| epoch 15 | 1400/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.25 | ppl 70.18\n", + "| epoch 15 | 1600/ 3181 batches | lr 2.44 | ms/batch 61.37 | loss 4.23 | ppl 68.99\n", + "| epoch 15 | 1800/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.25 | ppl 69.87\n", + "| epoch 15 | 2000/ 3181 batches | lr 2.44 | ms/batch 61.36 | loss 4.22 | ppl 67.79\n", + "| epoch 15 | 2200/ 3181 batches | lr 2.44 | ms/batch 61.40 | loss 4.21 | ppl 67.21\n", + "| epoch 15 | 2400/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.21 | ppl 67.61\n", + "| epoch 15 | 2600/ 3181 batches | lr 2.44 | ms/batch 61.40 | loss 4.15 | ppl 63.73\n", + "| epoch 15 | 2800/ 3181 batches | lr 2.44 | ms/batch 61.37 | loss 4.24 | ppl 69.43\n", + "| epoch 15 | 3000/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.15 | ppl 63.16\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 15 | time: 206.62s | valid loss 5.47 | valid ppl 238.57\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 16 | 200/ 3181 batches | lr 2.32 | ms/batch 61.60 | loss 4.26 | ppl 71.14\n", + "| epoch 16 | 400/ 3181 batches | lr 2.32 | ms/batch 61.33 | loss 4.19 | ppl 65.93\n", + "| epoch 16 | 600/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.18 | ppl 65.22\n", + "| epoch 16 | 800/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.19 | ppl 66.07\n", + "| epoch 16 | 1000/ 3181 batches | lr 2.32 | ms/batch 61.41 | loss 4.22 | ppl 68.20\n", + "| epoch 16 | 1200/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.17 | ppl 65.03\n", + "| epoch 16 | 1400/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.20 | ppl 66.72\n", + "| epoch 16 | 1600/ 3181 batches | lr 2.32 | ms/batch 61.38 | loss 4.19 | ppl 65.70\n", + "| epoch 16 | 1800/ 3181 batches | lr 2.32 | ms/batch 61.37 | loss 4.19 | ppl 66.27\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 18 | 1600/ 2727 batches | lr 2.09 | ms/batch 53.57 | loss 3.94 | ppl 51.31\n", - "| epoch 18 | 1800/ 2727 batches | lr 2.09 | ms/batch 53.55 | loss 4.00 | ppl 54.68\n", - "| epoch 18 | 2000/ 2727 batches | lr 2.09 | ms/batch 53.56 | loss 3.99 | ppl 54.32\n", - "| epoch 18 | 2200/ 2727 batches | lr 2.09 | ms/batch 53.55 | loss 4.00 | ppl 54.51\n", - "| epoch 18 | 2400/ 2727 batches | lr 2.09 | ms/batch 53.52 | loss 4.01 | ppl 54.88\n", - "| epoch 18 | 2600/ 2727 batches | lr 2.09 | ms/batch 53.54 | loss 4.02 | ppl 55.54\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 18 | time: 152.86s | valid loss 5.52 | valid ppl 249.11\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 19 | 200/ 2727 batches | lr 1.99 | ms/batch 53.84 | loss 4.05 | ppl 57.55\n", - "| epoch 19 | 400/ 2727 batches | lr 1.99 | ms/batch 53.54 | loss 3.98 | ppl 53.78\n", - "| epoch 19 | 600/ 2727 batches | lr 1.99 | ms/batch 53.60 | loss 3.98 | ppl 53.75\n", - "| epoch 19 | 800/ 2727 batches | lr 1.99 | ms/batch 53.55 | loss 3.98 | ppl 53.61\n", - "| epoch 19 | 1000/ 2727 batches | lr 1.99 | ms/batch 53.57 | loss 3.94 | ppl 51.43\n", - "| epoch 19 | 1200/ 2727 batches | lr 1.99 | ms/batch 53.59 | loss 3.98 | ppl 53.41\n", - "| epoch 19 | 1400/ 2727 batches | lr 1.99 | ms/batch 53.60 | loss 3.94 | ppl 51.20\n", - "| epoch 19 | 1600/ 2727 batches | lr 1.99 | ms/batch 53.57 | loss 3.90 | ppl 49.32\n", - "| epoch 19 | 1800/ 2727 batches | lr 1.99 | ms/batch 53.55 | loss 3.96 | ppl 52.57\n", - "| epoch 19 | 2000/ 2727 batches | lr 1.99 | ms/batch 53.52 | loss 3.95 | ppl 52.05\n", - "| epoch 19 | 2200/ 2727 batches | lr 1.99 | ms/batch 53.54 | loss 3.95 | ppl 52.16\n", - "| epoch 19 | 2400/ 2727 batches | lr 1.99 | ms/batch 53.55 | loss 3.96 | ppl 52.57\n", - "| epoch 19 | 2600/ 2727 batches | lr 1.99 | ms/batch 53.56 | loss 3.97 | ppl 53.06\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 19 | time: 152.88s | valid loss 5.50 | valid ppl 244.11\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 20 | 200/ 2727 batches | lr 1.89 | ms/batch 53.86 | loss 4.01 | ppl 55.25\n", - "| epoch 20 | 400/ 2727 batches | lr 1.89 | ms/batch 53.56 | loss 3.94 | ppl 51.37\n", - "| epoch 20 | 600/ 2727 batches | lr 1.89 | ms/batch 53.55 | loss 3.94 | ppl 51.51\n", - "| epoch 20 | 800/ 2727 batches | lr 1.89 | ms/batch 53.54 | loss 3.94 | ppl 51.36\n", - "| epoch 20 | 1000/ 2727 batches | lr 1.89 | ms/batch 53.55 | loss 3.90 | ppl 49.49\n", - "| epoch 20 | 1200/ 2727 batches | lr 1.89 | ms/batch 53.55 | loss 3.94 | ppl 51.19\n", - "| epoch 20 | 1400/ 2727 batches | lr 1.89 | ms/batch 53.53 | loss 3.89 | ppl 49.02\n", - "| epoch 20 | 1600/ 2727 batches | lr 1.89 | ms/batch 53.56 | loss 3.86 | ppl 47.33\n", - "| epoch 20 | 1800/ 2727 batches | lr 1.89 | ms/batch 53.52 | loss 3.91 | ppl 50.00\n", - "| epoch 20 | 2000/ 2727 batches | lr 1.89 | ms/batch 53.55 | loss 3.91 | ppl 49.91\n", - "| epoch 20 | 2200/ 2727 batches | lr 1.89 | ms/batch 53.53 | loss 3.91 | ppl 50.04\n", - "| epoch 20 | 2400/ 2727 batches | lr 1.89 | ms/batch 53.55 | loss 3.92 | ppl 50.39\n", - "| epoch 20 | 2600/ 2727 batches | lr 1.89 | ms/batch 53.53 | loss 3.92 | ppl 50.64\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 20 | time: 152.85s | valid loss 5.54 | valid ppl 253.74\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 21 | 200/ 2727 batches | lr 1.79 | ms/batch 53.83 | loss 3.97 | ppl 52.84\n", - "| epoch 21 | 400/ 2727 batches | lr 1.79 | ms/batch 53.58 | loss 3.90 | ppl 49.32\n", - "| epoch 21 | 600/ 2727 batches | lr 1.79 | ms/batch 53.55 | loss 3.90 | ppl 49.43\n", - "| epoch 21 | 800/ 2727 batches | lr 1.79 | ms/batch 53.56 | loss 3.90 | ppl 49.36\n", - "| epoch 21 | 1000/ 2727 batches | lr 1.79 | ms/batch 53.58 | loss 3.86 | ppl 47.32\n", - "| epoch 21 | 1200/ 2727 batches | lr 1.79 | ms/batch 53.55 | loss 3.89 | ppl 49.08\n", - "| epoch 21 | 1400/ 2727 batches | lr 1.79 | ms/batch 53.55 | loss 3.85 | ppl 47.10\n", - "| epoch 21 | 1600/ 2727 batches | lr 1.79 | ms/batch 53.57 | loss 3.82 | ppl 45.54\n", - "| epoch 21 | 1800/ 2727 batches | lr 1.79 | ms/batch 53.55 | loss 3.87 | ppl 48.00\n", - "| epoch 21 | 2000/ 2727 batches | lr 1.79 | ms/batch 53.52 | loss 3.87 | ppl 47.92\n", - "| epoch 21 | 2200/ 2727 batches | lr 1.79 | ms/batch 53.51 | loss 3.87 | ppl 48.13\n", - "| epoch 21 | 2400/ 2727 batches | lr 1.79 | ms/batch 53.59 | loss 3.88 | ppl 48.32\n", - "| epoch 21 | 2600/ 2727 batches | lr 1.79 | ms/batch 53.53 | loss 3.89 | ppl 48.84\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 21 | time: 152.88s | valid loss 5.53 | valid ppl 252.72\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 22 | 200/ 2727 batches | lr 1.70 | ms/batch 53.79 | loss 3.93 | ppl 50.71\n", - "| epoch 22 | 400/ 2727 batches | lr 1.70 | ms/batch 53.51 | loss 3.86 | ppl 47.49\n", - "| epoch 22 | 600/ 2727 batches | lr 1.70 | ms/batch 53.52 | loss 3.87 | ppl 47.74\n", - "| epoch 22 | 800/ 2727 batches | lr 1.70 | ms/batch 53.51 | loss 3.86 | ppl 47.44\n", - "| epoch 22 | 1000/ 2727 batches | lr 1.70 | ms/batch 53.47 | loss 3.82 | ppl 45.63\n", - "| epoch 22 | 1200/ 2727 batches | lr 1.70 | ms/batch 53.53 | loss 3.85 | ppl 47.21\n", - "| epoch 22 | 1400/ 2727 batches | lr 1.70 | ms/batch 53.55 | loss 3.82 | ppl 45.39\n", - "| epoch 22 | 1600/ 2727 batches | lr 1.70 | ms/batch 53.54 | loss 3.78 | ppl 43.90\n", - "| epoch 22 | 1800/ 2727 batches | lr 1.70 | ms/batch 53.53 | loss 3.83 | ppl 46.06\n", - "| epoch 22 | 2000/ 2727 batches | lr 1.70 | ms/batch 53.50 | loss 3.82 | ppl 45.82\n", - "| epoch 22 | 2200/ 2727 batches | lr 1.70 | ms/batch 53.56 | loss 3.83 | ppl 46.04\n", - "| epoch 22 | 2400/ 2727 batches | lr 1.70 | ms/batch 53.54 | loss 3.84 | ppl 46.38\n", - "| epoch 22 | 2600/ 2727 batches | lr 1.70 | ms/batch 53.55 | loss 3.85 | ppl 47.16\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 22 | time: 152.79s | valid loss 5.54 | valid ppl 254.96\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 23 | 200/ 2727 batches | lr 1.62 | ms/batch 53.83 | loss 3.89 | ppl 48.94\n", - "| epoch 23 | 400/ 2727 batches | lr 1.62 | ms/batch 53.58 | loss 3.82 | ppl 45.63\n", - "| epoch 23 | 600/ 2727 batches | lr 1.62 | ms/batch 53.55 | loss 3.83 | ppl 46.01\n", - "| epoch 23 | 800/ 2727 batches | lr 1.62 | ms/batch 53.56 | loss 3.82 | ppl 45.62\n", - "| epoch 23 | 1000/ 2727 batches | lr 1.62 | ms/batch 53.56 | loss 3.79 | ppl 44.07\n", - "| epoch 23 | 1200/ 2727 batches | lr 1.62 | ms/batch 53.58 | loss 3.82 | ppl 45.52\n", - "| epoch 23 | 1400/ 2727 batches | lr 1.62 | ms/batch 53.58 | loss 3.78 | ppl 43.71\n", - "| epoch 23 | 1600/ 2727 batches | lr 1.62 | ms/batch 53.53 | loss 3.75 | ppl 42.52\n", - "| epoch 23 | 1800/ 2727 batches | lr 1.62 | ms/batch 53.57 | loss 3.80 | ppl 44.67\n", - "| epoch 23 | 2000/ 2727 batches | lr 1.62 | ms/batch 53.57 | loss 3.79 | ppl 44.26\n", - "| epoch 23 | 2200/ 2727 batches | lr 1.62 | ms/batch 53.55 | loss 3.80 | ppl 44.73\n", - "| epoch 23 | 2400/ 2727 batches | lr 1.62 | ms/batch 53.59 | loss 3.80 | ppl 44.89\n", - "| epoch 23 | 2600/ 2727 batches | lr 1.62 | ms/batch 53.58 | loss 3.82 | ppl 45.39\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 23 | time: 152.90s | valid loss 5.57 | valid ppl 263.65\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 24 | 200/ 2727 batches | lr 1.54 | ms/batch 53.83 | loss 3.85 | ppl 47.20\n", - "| epoch 24 | 400/ 2727 batches | lr 1.54 | ms/batch 53.56 | loss 3.79 | ppl 44.15\n", - "| epoch 24 | 600/ 2727 batches | lr 1.54 | ms/batch 53.58 | loss 3.79 | ppl 44.42\n", - "| epoch 24 | 800/ 2727 batches | lr 1.54 | ms/batch 53.56 | loss 3.78 | ppl 43.96\n" + "| epoch 16 | 2000/ 3181 batches | lr 2.32 | ms/batch 61.40 | loss 4.17 | ppl 64.69\n", + "| epoch 16 | 2200/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.15 | ppl 63.70\n", + "| epoch 16 | 2400/ 3181 batches | lr 2.32 | ms/batch 61.36 | loss 4.17 | ppl 64.52\n", + "| epoch 16 | 2600/ 3181 batches | lr 2.32 | ms/batch 61.41 | loss 4.11 | ppl 60.85\n", + "| epoch 16 | 2800/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.19 | ppl 66.21\n", + "| epoch 16 | 3000/ 3181 batches | lr 2.32 | ms/batch 61.39 | loss 4.09 | ppl 59.76\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 16 | time: 206.63s | valid loss 5.50 | valid ppl 243.52\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 17 | 200/ 3181 batches | lr 2.20 | ms/batch 61.74 | loss 4.21 | ppl 67.46\n", + "| epoch 17 | 400/ 3181 batches | lr 2.20 | ms/batch 61.43 | loss 4.14 | ppl 62.91\n", + "| epoch 17 | 600/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.13 | ppl 61.89\n", + "| epoch 17 | 800/ 3181 batches | lr 2.20 | ms/batch 61.44 | loss 4.15 | ppl 63.38\n", + "| epoch 17 | 1000/ 3181 batches | lr 2.20 | ms/batch 61.34 | loss 4.17 | ppl 64.88\n", + "| epoch 17 | 1200/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.13 | ppl 62.19\n", + "| epoch 17 | 1400/ 3181 batches | lr 2.20 | ms/batch 61.38 | loss 4.15 | ppl 63.41\n", + "| epoch 17 | 1600/ 3181 batches | lr 2.20 | ms/batch 61.37 | loss 4.13 | ppl 62.14\n", + "| epoch 17 | 1800/ 3181 batches | lr 2.20 | ms/batch 61.40 | loss 4.15 | ppl 63.28\n", + "| epoch 17 | 2000/ 3181 batches | lr 2.20 | ms/batch 61.34 | loss 4.12 | ppl 61.53\n", + "| epoch 17 | 2200/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.10 | ppl 60.52\n", + "| epoch 17 | 2400/ 3181 batches | lr 2.20 | ms/batch 61.45 | loss 4.11 | ppl 61.21\n", + "| epoch 17 | 2600/ 3181 batches | lr 2.20 | ms/batch 61.33 | loss 4.06 | ppl 58.01\n", + "| epoch 17 | 2800/ 3181 batches | lr 2.20 | ms/batch 61.36 | loss 4.14 | ppl 62.99\n", + "| epoch 17 | 3000/ 3181 batches | lr 2.20 | ms/batch 61.36 | loss 4.04 | ppl 56.98\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 17 | time: 206.66s | valid loss 5.51 | valid ppl 245.93\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 18 | 200/ 3181 batches | lr 2.09 | ms/batch 61.64 | loss 4.16 | ppl 64.27\n", + "| epoch 18 | 400/ 3181 batches | lr 2.09 | ms/batch 61.43 | loss 4.09 | ppl 59.95\n", + "| epoch 18 | 600/ 3181 batches | lr 2.09 | ms/batch 61.38 | loss 4.08 | ppl 58.99\n", + "| epoch 18 | 800/ 3181 batches | lr 2.09 | ms/batch 61.31 | loss 4.10 | ppl 60.18\n", + "| epoch 18 | 1000/ 3181 batches | lr 2.09 | ms/batch 61.37 | loss 4.12 | ppl 61.79\n", + "| epoch 18 | 1200/ 3181 batches | lr 2.09 | ms/batch 61.42 | loss 4.08 | ppl 58.92\n", + "| epoch 18 | 1400/ 3181 batches | lr 2.09 | ms/batch 61.36 | loss 4.10 | ppl 60.40\n", + "| epoch 18 | 1600/ 3181 batches | lr 2.09 | ms/batch 61.43 | loss 4.08 | ppl 59.34\n", + "| epoch 18 | 1800/ 3181 batches | lr 2.09 | ms/batch 61.36 | loss 4.09 | ppl 59.74\n", + "| epoch 18 | 2000/ 3181 batches | lr 2.09 | ms/batch 61.39 | loss 4.07 | ppl 58.43\n", + "| epoch 18 | 2200/ 3181 batches | lr 2.09 | ms/batch 61.32 | loss 4.06 | ppl 58.17\n", + "| epoch 18 | 2400/ 3181 batches | lr 2.09 | ms/batch 61.32 | loss 4.07 | ppl 58.27\n", + "| epoch 18 | 2600/ 3181 batches | lr 2.09 | ms/batch 61.35 | loss 4.01 | ppl 55.01\n", + "| epoch 18 | 2800/ 3181 batches | lr 2.09 | ms/batch 61.40 | loss 4.09 | ppl 59.91\n", + "| epoch 18 | 3000/ 3181 batches | lr 2.09 | ms/batch 61.34 | loss 4.00 | ppl 54.82\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 18 | time: 206.65s | valid loss 5.52 | valid ppl 248.66\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 19 | 200/ 3181 batches | lr 1.99 | ms/batch 61.63 | loss 4.12 | ppl 61.26\n", + "| epoch 19 | 400/ 3181 batches | lr 1.99 | ms/batch 61.36 | loss 4.04 | ppl 57.10\n", + "| epoch 19 | 600/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.03 | ppl 56.18\n", + "| epoch 19 | 800/ 3181 batches | lr 1.99 | ms/batch 61.36 | loss 4.06 | ppl 57.74\n", + "| epoch 19 | 1000/ 3181 batches | lr 1.99 | ms/batch 61.49 | loss 4.08 | ppl 59.10\n", + "| epoch 19 | 1200/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.03 | ppl 56.27\n", + "| epoch 19 | 1400/ 3181 batches | lr 1.99 | ms/batch 61.34 | loss 4.06 | ppl 57.99\n", + "| epoch 19 | 1600/ 3181 batches | lr 1.99 | ms/batch 61.40 | loss 4.04 | ppl 56.78\n", + "| epoch 19 | 1800/ 3181 batches | lr 1.99 | ms/batch 61.39 | loss 4.05 | ppl 57.32\n", + "| epoch 19 | 2000/ 3181 batches | lr 1.99 | ms/batch 61.43 | loss 4.03 | ppl 56.16\n", + "| epoch 19 | 2200/ 3181 batches | lr 1.99 | ms/batch 61.34 | loss 4.02 | ppl 55.62\n", + "| epoch 19 | 2400/ 3181 batches | lr 1.99 | ms/batch 61.42 | loss 4.02 | ppl 55.68\n", + "| epoch 19 | 2600/ 3181 batches | lr 1.99 | ms/batch 61.38 | loss 3.97 | ppl 52.86\n", + "| epoch 19 | 2800/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.05 | ppl 57.12\n", + "| epoch 19 | 3000/ 3181 batches | lr 1.99 | ms/batch 61.31 | loss 3.95 | ppl 52.08\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 19 | time: 206.62s | valid loss 5.55 | valid ppl 257.12\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 20 | 200/ 3181 batches | lr 1.89 | ms/batch 61.70 | loss 4.07 | ppl 58.59\n", + "| epoch 20 | 400/ 3181 batches | lr 1.89 | ms/batch 61.38 | loss 4.01 | ppl 55.07\n", + "| epoch 20 | 600/ 3181 batches | lr 1.89 | ms/batch 61.40 | loss 3.99 | ppl 53.82\n", + "| epoch 20 | 800/ 3181 batches | lr 1.89 | ms/batch 61.40 | loss 4.01 | ppl 55.29\n", + "| epoch 20 | 1000/ 3181 batches | lr 1.89 | ms/batch 61.35 | loss 4.04 | ppl 56.83\n", + "| epoch 20 | 1200/ 3181 batches | lr 1.89 | ms/batch 61.34 | loss 3.99 | ppl 54.01\n", + "| epoch 20 | 1400/ 3181 batches | lr 1.89 | ms/batch 61.35 | loss 4.02 | ppl 55.48\n", + "| epoch 20 | 1600/ 3181 batches | lr 1.89 | ms/batch 61.33 | loss 4.00 | ppl 54.51\n", + "| epoch 20 | 1800/ 3181 batches | lr 1.89 | ms/batch 61.41 | loss 4.01 | ppl 55.02\n", + "| epoch 20 | 2000/ 3181 batches | lr 1.89 | ms/batch 61.38 | loss 3.99 | ppl 54.00\n", + "| epoch 20 | 2200/ 3181 batches | lr 1.89 | ms/batch 61.39 | loss 3.97 | ppl 53.23\n", + "| epoch 20 | 2400/ 3181 batches | lr 1.89 | ms/batch 61.29 | loss 3.98 | ppl 53.61\n", + "| epoch 20 | 2600/ 3181 batches | lr 1.89 | ms/batch 61.30 | loss 3.92 | ppl 50.62\n", + "| epoch 20 | 2800/ 3181 batches | lr 1.89 | ms/batch 61.32 | loss 4.01 | ppl 55.04\n", + "| epoch 20 | 3000/ 3181 batches | lr 1.89 | ms/batch 61.39 | loss 3.92 | ppl 50.18\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 20 | time: 206.60s | valid loss 5.61 | valid ppl 273.93\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 21 | 200/ 3181 batches | lr 1.79 | ms/batch 61.65 | loss 4.03 | ppl 56.37\n", + "| epoch 21 | 400/ 3181 batches | lr 1.79 | ms/batch 61.42 | loss 3.96 | ppl 52.65\n", + "| epoch 21 | 600/ 3181 batches | lr 1.79 | ms/batch 61.43 | loss 3.94 | ppl 51.53\n", + "| epoch 21 | 800/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.97 | ppl 52.82\n", + "| epoch 21 | 1000/ 3181 batches | lr 1.79 | ms/batch 61.34 | loss 3.99 | ppl 54.28\n", + "| epoch 21 | 1200/ 3181 batches | lr 1.79 | ms/batch 61.31 | loss 3.95 | ppl 51.85\n", + "| epoch 21 | 1400/ 3181 batches | lr 1.79 | ms/batch 61.33 | loss 3.98 | ppl 53.51\n", + "| epoch 21 | 1600/ 3181 batches | lr 1.79 | ms/batch 61.37 | loss 3.96 | ppl 52.23\n", + "| epoch 21 | 1800/ 3181 batches | lr 1.79 | ms/batch 61.42 | loss 3.97 | ppl 52.95\n", + "| epoch 21 | 2000/ 3181 batches | lr 1.79 | ms/batch 61.38 | loss 3.95 | ppl 51.71\n", + "| epoch 21 | 2200/ 3181 batches | lr 1.79 | ms/batch 61.38 | loss 3.94 | ppl 51.19\n", + "| epoch 21 | 2400/ 3181 batches | lr 1.79 | ms/batch 61.34 | loss 3.94 | ppl 51.57\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 24 | 1000/ 2727 batches | lr 1.54 | ms/batch 53.58 | loss 3.75 | ppl 42.47\n", - "| epoch 24 | 1200/ 2727 batches | lr 1.54 | ms/batch 53.57 | loss 3.78 | ppl 44.00\n", - "| epoch 24 | 1400/ 2727 batches | lr 1.54 | ms/batch 53.59 | loss 3.74 | ppl 42.28\n", - "| epoch 24 | 1600/ 2727 batches | lr 1.54 | ms/batch 53.57 | loss 3.72 | ppl 41.13\n", - "| epoch 24 | 1800/ 2727 batches | lr 1.54 | ms/batch 53.57 | loss 3.76 | ppl 43.08\n", - "| epoch 24 | 2000/ 2727 batches | lr 1.54 | ms/batch 53.54 | loss 3.76 | ppl 42.81\n", - "| epoch 24 | 2200/ 2727 batches | lr 1.54 | ms/batch 53.55 | loss 3.77 | ppl 43.35\n", - "| epoch 24 | 2400/ 2727 batches | lr 1.54 | ms/batch 53.57 | loss 3.77 | ppl 43.25\n", - "| epoch 24 | 2600/ 2727 batches | lr 1.54 | ms/batch 53.56 | loss 3.77 | ppl 43.53\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 24 | time: 152.91s | valid loss 5.60 | valid ppl 271.38\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 25 | 200/ 2727 batches | lr 1.46 | ms/batch 53.86 | loss 3.83 | ppl 45.88\n", - "| epoch 25 | 400/ 2727 batches | lr 1.46 | ms/batch 53.56 | loss 3.76 | ppl 42.91\n", - "| epoch 25 | 600/ 2727 batches | lr 1.46 | ms/batch 53.55 | loss 3.76 | ppl 42.82\n", - "| epoch 25 | 800/ 2727 batches | lr 1.46 | ms/batch 53.54 | loss 3.75 | ppl 42.46\n", - "| epoch 25 | 1000/ 2727 batches | lr 1.46 | ms/batch 53.56 | loss 3.72 | ppl 41.09\n", - "| epoch 25 | 1200/ 2727 batches | lr 1.46 | ms/batch 53.53 | loss 3.75 | ppl 42.47\n", - "| epoch 25 | 1400/ 2727 batches | lr 1.46 | ms/batch 53.56 | loss 3.71 | ppl 40.83\n", - "| epoch 25 | 1600/ 2727 batches | lr 1.46 | ms/batch 53.56 | loss 3.69 | ppl 39.91\n", - "| epoch 25 | 1800/ 2727 batches | lr 1.46 | ms/batch 53.55 | loss 3.73 | ppl 41.62\n", - "| epoch 25 | 2000/ 2727 batches | lr 1.46 | ms/batch 53.55 | loss 3.72 | ppl 41.31\n", - "| epoch 25 | 2200/ 2727 batches | lr 1.46 | ms/batch 53.53 | loss 3.73 | ppl 41.76\n", - "| epoch 25 | 2400/ 2727 batches | lr 1.46 | ms/batch 53.52 | loss 3.74 | ppl 41.91\n", - "| epoch 25 | 2600/ 2727 batches | lr 1.46 | ms/batch 53.56 | loss 3.74 | ppl 42.25\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 25 | time: 152.86s | valid loss 5.63 | valid ppl 278.62\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 26 | 200/ 2727 batches | lr 1.39 | ms/batch 53.85 | loss 3.79 | ppl 44.36\n", - "| epoch 26 | 400/ 2727 batches | lr 1.39 | ms/batch 53.53 | loss 3.72 | ppl 41.40\n", - "| epoch 26 | 600/ 2727 batches | lr 1.39 | ms/batch 53.55 | loss 3.73 | ppl 41.71\n", - "| epoch 26 | 800/ 2727 batches | lr 1.39 | ms/batch 53.56 | loss 3.72 | ppl 41.34\n", - "| epoch 26 | 1000/ 2727 batches | lr 1.39 | ms/batch 53.51 | loss 3.69 | ppl 39.94\n", - "| epoch 26 | 1200/ 2727 batches | lr 1.39 | ms/batch 53.54 | loss 3.71 | ppl 41.05\n", - "| epoch 26 | 1400/ 2727 batches | lr 1.39 | ms/batch 53.58 | loss 3.68 | ppl 39.64\n", - "| epoch 26 | 1600/ 2727 batches | lr 1.39 | ms/batch 53.57 | loss 3.66 | ppl 38.83\n", - "| epoch 26 | 1800/ 2727 batches | lr 1.39 | ms/batch 53.56 | loss 3.69 | ppl 40.20\n", - "| epoch 26 | 2000/ 2727 batches | lr 1.39 | ms/batch 53.59 | loss 3.69 | ppl 40.13\n", - "| epoch 26 | 2200/ 2727 batches | lr 1.39 | ms/batch 53.53 | loss 3.70 | ppl 40.57\n", - "| epoch 26 | 2400/ 2727 batches | lr 1.39 | ms/batch 53.56 | loss 3.70 | ppl 40.39\n", - "| epoch 26 | 2600/ 2727 batches | lr 1.39 | ms/batch 53.56 | loss 3.72 | ppl 41.18\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 26 | time: 152.88s | valid loss 5.68 | valid ppl 291.76\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 27 | 200/ 2727 batches | lr 1.32 | ms/batch 53.83 | loss 3.76 | ppl 43.16\n", - "| epoch 27 | 400/ 2727 batches | lr 1.32 | ms/batch 53.57 | loss 3.69 | ppl 40.11\n", - "| epoch 27 | 600/ 2727 batches | lr 1.32 | ms/batch 53.55 | loss 3.69 | ppl 40.24\n", - "| epoch 27 | 800/ 2727 batches | lr 1.32 | ms/batch 53.55 | loss 3.69 | ppl 40.17\n", - "| epoch 27 | 1000/ 2727 batches | lr 1.32 | ms/batch 53.59 | loss 3.66 | ppl 38.70\n", - "| epoch 27 | 1200/ 2727 batches | lr 1.32 | ms/batch 53.53 | loss 3.69 | ppl 39.91\n", - "| epoch 27 | 1400/ 2727 batches | lr 1.32 | ms/batch 53.53 | loss 3.65 | ppl 38.30\n", - "| epoch 27 | 1600/ 2727 batches | lr 1.32 | ms/batch 53.58 | loss 3.63 | ppl 37.70\n", - "| epoch 27 | 1800/ 2727 batches | lr 1.32 | ms/batch 53.57 | loss 3.67 | ppl 39.13\n", - "| epoch 27 | 2000/ 2727 batches | lr 1.32 | ms/batch 53.53 | loss 3.66 | ppl 38.95\n", - "| epoch 27 | 2200/ 2727 batches | lr 1.32 | ms/batch 53.58 | loss 3.67 | ppl 39.37\n", - "| epoch 27 | 2400/ 2727 batches | lr 1.32 | ms/batch 53.57 | loss 3.68 | ppl 39.50\n", - "| epoch 27 | 2600/ 2727 batches | lr 1.32 | ms/batch 53.55 | loss 3.68 | ppl 39.59\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 27 | time: 152.91s | valid loss 5.66 | valid ppl 286.29\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 28 | 200/ 2727 batches | lr 1.25 | ms/batch 53.80 | loss 3.73 | ppl 41.62\n", - "| epoch 28 | 400/ 2727 batches | lr 1.25 | ms/batch 53.58 | loss 3.67 | ppl 39.16\n", - "| epoch 28 | 600/ 2727 batches | lr 1.25 | ms/batch 53.56 | loss 3.67 | ppl 39.34\n", - "| epoch 28 | 800/ 2727 batches | lr 1.25 | ms/batch 53.56 | loss 3.67 | ppl 39.07\n", - "| epoch 28 | 1000/ 2727 batches | lr 1.25 | ms/batch 53.59 | loss 3.63 | ppl 37.65\n", - "| epoch 28 | 1200/ 2727 batches | lr 1.25 | ms/batch 53.60 | loss 3.66 | ppl 38.76\n", - "| epoch 28 | 1400/ 2727 batches | lr 1.25 | ms/batch 53.57 | loss 3.62 | ppl 37.52\n", - "| epoch 28 | 1600/ 2727 batches | lr 1.25 | ms/batch 53.61 | loss 3.60 | ppl 36.77\n", - "| epoch 28 | 1800/ 2727 batches | lr 1.25 | ms/batch 53.54 | loss 3.64 | ppl 38.03\n", - "| epoch 28 | 2000/ 2727 batches | lr 1.25 | ms/batch 53.55 | loss 3.63 | ppl 37.81\n", - "| epoch 28 | 2200/ 2727 batches | lr 1.25 | ms/batch 53.56 | loss 3.65 | ppl 38.44\n", - "| epoch 28 | 2400/ 2727 batches | lr 1.25 | ms/batch 53.54 | loss 3.65 | ppl 38.44\n", - "| epoch 28 | 2600/ 2727 batches | lr 1.25 | ms/batch 53.54 | loss 3.66 | ppl 39.04\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 28 | time: 152.89s | valid loss 5.65 | valid ppl 285.63\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 29 | 200/ 2727 batches | lr 1.19 | ms/batch 53.80 | loss 3.70 | ppl 40.64\n", - "| epoch 29 | 400/ 2727 batches | lr 1.19 | ms/batch 53.52 | loss 3.64 | ppl 38.25\n", - "| epoch 29 | 600/ 2727 batches | lr 1.19 | ms/batch 53.56 | loss 3.64 | ppl 38.23\n", - "| epoch 29 | 800/ 2727 batches | lr 1.19 | ms/batch 53.58 | loss 3.63 | ppl 37.88\n", - "| epoch 29 | 1000/ 2727 batches | lr 1.19 | ms/batch 53.50 | loss 3.60 | ppl 36.69\n", - "| epoch 29 | 1200/ 2727 batches | lr 1.19 | ms/batch 53.55 | loss 3.64 | ppl 37.91\n", - "| epoch 29 | 1400/ 2727 batches | lr 1.19 | ms/batch 53.55 | loss 3.60 | ppl 36.52\n", - "| epoch 29 | 1600/ 2727 batches | lr 1.19 | ms/batch 53.54 | loss 3.58 | ppl 35.87\n", - "| epoch 29 | 1800/ 2727 batches | lr 1.19 | ms/batch 53.54 | loss 3.61 | ppl 36.97\n", - "| epoch 29 | 2000/ 2727 batches | lr 1.19 | ms/batch 53.50 | loss 3.61 | ppl 36.95\n", - "| epoch 29 | 2200/ 2727 batches | lr 1.19 | ms/batch 53.53 | loss 3.62 | ppl 37.32\n", - "| epoch 29 | 2400/ 2727 batches | lr 1.19 | ms/batch 53.54 | loss 3.62 | ppl 37.20\n", - "| epoch 29 | 2600/ 2727 batches | lr 1.19 | ms/batch 53.50 | loss 3.64 | ppl 37.98\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 29 | time: 152.82s | valid loss 5.69 | valid ppl 294.94\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 30 | 200/ 2727 batches | lr 1.13 | ms/batch 53.83 | loss 3.68 | ppl 39.81\n" + "| epoch 21 | 2600/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.88 | ppl 48.60\n", + "| epoch 21 | 2800/ 3181 batches | lr 1.79 | ms/batch 61.40 | loss 3.97 | ppl 52.99\n", + "| epoch 21 | 3000/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.87 | ppl 48.17\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 21 | time: 206.61s | valid loss 5.61 | valid ppl 273.11\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 22 | 200/ 3181 batches | lr 1.70 | ms/batch 61.70 | loss 3.99 | ppl 54.02\n", + "| epoch 22 | 400/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.92 | ppl 50.52\n", + "| epoch 22 | 600/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.90 | ppl 49.61\n", + "| epoch 22 | 800/ 3181 batches | lr 1.70 | ms/batch 61.33 | loss 3.93 | ppl 51.15\n", + "| epoch 22 | 1000/ 3181 batches | lr 1.70 | ms/batch 61.34 | loss 3.96 | ppl 52.34\n", + "| epoch 22 | 1200/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.91 | ppl 50.10\n", + "| epoch 22 | 1400/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.94 | ppl 51.37\n", + "| epoch 22 | 1600/ 3181 batches | lr 1.70 | ms/batch 61.37 | loss 3.92 | ppl 50.25\n", + "| epoch 22 | 1800/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.93 | ppl 50.89\n", + "| epoch 22 | 2000/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.91 | ppl 49.70\n", + "| epoch 22 | 2200/ 3181 batches | lr 1.70 | ms/batch 61.43 | loss 3.90 | ppl 49.28\n", + "| epoch 22 | 2400/ 3181 batches | lr 1.70 | ms/batch 61.37 | loss 3.90 | ppl 49.46\n", + "| epoch 22 | 2600/ 3181 batches | lr 1.70 | ms/batch 61.41 | loss 3.84 | ppl 46.62\n", + "| epoch 22 | 2800/ 3181 batches | lr 1.70 | ms/batch 61.38 | loss 3.93 | ppl 50.75\n", + "| epoch 22 | 3000/ 3181 batches | lr 1.70 | ms/batch 61.34 | loss 3.83 | ppl 46.27\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 22 | time: 206.60s | valid loss 5.61 | valid ppl 273.57\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 23 | 200/ 3181 batches | lr 1.62 | ms/batch 61.61 | loss 3.96 | ppl 52.31\n", + "| epoch 23 | 400/ 3181 batches | lr 1.62 | ms/batch 61.32 | loss 3.88 | ppl 48.56\n", + "| epoch 23 | 600/ 3181 batches | lr 1.62 | ms/batch 61.35 | loss 3.86 | ppl 47.70\n", + "| epoch 23 | 800/ 3181 batches | lr 1.62 | ms/batch 61.31 | loss 3.90 | ppl 49.41\n", + "| epoch 23 | 1000/ 3181 batches | lr 1.62 | ms/batch 61.41 | loss 3.92 | ppl 50.42\n", + "| epoch 23 | 1200/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.88 | ppl 48.43\n", + "| epoch 23 | 1400/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.91 | ppl 49.85\n", + "| epoch 23 | 1600/ 3181 batches | lr 1.62 | ms/batch 61.30 | loss 3.88 | ppl 48.37\n", + "| epoch 23 | 1800/ 3181 batches | lr 1.62 | ms/batch 61.34 | loss 3.89 | ppl 49.03\n", + "| epoch 23 | 2000/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.87 | ppl 48.12\n", + "| epoch 23 | 2200/ 3181 batches | lr 1.62 | ms/batch 61.36 | loss 3.86 | ppl 47.57\n", + "| epoch 23 | 2400/ 3181 batches | lr 1.62 | ms/batch 61.38 | loss 3.87 | ppl 47.73\n", + "| epoch 23 | 2600/ 3181 batches | lr 1.62 | ms/batch 61.29 | loss 3.81 | ppl 45.15\n", + "| epoch 23 | 2800/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.90 | ppl 49.58\n", + "| epoch 23 | 3000/ 3181 batches | lr 1.62 | ms/batch 61.38 | loss 3.80 | ppl 44.75\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 23 | time: 206.56s | valid loss 5.64 | valid ppl 281.95\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 24 | 200/ 3181 batches | lr 1.54 | ms/batch 61.67 | loss 3.92 | ppl 50.35\n", + "| epoch 24 | 400/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.85 | ppl 47.01\n", + "| epoch 24 | 600/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.84 | ppl 46.34\n", + "| epoch 24 | 800/ 3181 batches | lr 1.54 | ms/batch 61.43 | loss 3.87 | ppl 47.90\n", + "| epoch 24 | 1000/ 3181 batches | lr 1.54 | ms/batch 61.47 | loss 3.89 | ppl 48.81\n", + "| epoch 24 | 1200/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.85 | ppl 46.83\n", + "| epoch 24 | 1400/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.87 | ppl 48.14\n", + "| epoch 24 | 1600/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.85 | ppl 46.96\n", + "| epoch 24 | 1800/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.86 | ppl 47.49\n", + "| epoch 24 | 2000/ 3181 batches | lr 1.54 | ms/batch 61.47 | loss 3.84 | ppl 46.41\n", + "| epoch 24 | 2200/ 3181 batches | lr 1.54 | ms/batch 61.31 | loss 3.82 | ppl 45.83\n", + "| epoch 24 | 2400/ 3181 batches | lr 1.54 | ms/batch 61.35 | loss 3.83 | ppl 46.13\n", + "| epoch 24 | 2600/ 3181 batches | lr 1.54 | ms/batch 61.36 | loss 3.77 | ppl 43.56\n", + "| epoch 24 | 2800/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.86 | ppl 47.52\n", + "| epoch 24 | 3000/ 3181 batches | lr 1.54 | ms/batch 61.29 | loss 3.77 | ppl 43.23\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 24 | time: 206.67s | valid loss 5.67 | valid ppl 290.25\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 25 | 200/ 3181 batches | lr 1.46 | ms/batch 61.66 | loss 3.89 | ppl 48.76\n", + "| epoch 25 | 400/ 3181 batches | lr 1.46 | ms/batch 61.41 | loss 3.82 | ppl 45.61\n", + "| epoch 25 | 600/ 3181 batches | lr 1.46 | ms/batch 61.44 | loss 3.80 | ppl 44.79\n", + "| epoch 25 | 800/ 3181 batches | lr 1.46 | ms/batch 61.35 | loss 3.83 | ppl 46.26\n", + "| epoch 25 | 1000/ 3181 batches | lr 1.46 | ms/batch 61.36 | loss 3.86 | ppl 47.26\n", + "| epoch 25 | 1200/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.81 | ppl 45.19\n", + "| epoch 25 | 1400/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.84 | ppl 46.37\n", + "| epoch 25 | 1600/ 3181 batches | lr 1.46 | ms/batch 61.36 | loss 3.82 | ppl 45.47\n", + "| epoch 25 | 1800/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.83 | ppl 45.88\n", + "| epoch 25 | 2000/ 3181 batches | lr 1.46 | ms/batch 61.35 | loss 3.81 | ppl 45.08\n", + "| epoch 25 | 2200/ 3181 batches | lr 1.46 | ms/batch 61.43 | loss 3.80 | ppl 44.56\n", + "| epoch 25 | 2400/ 3181 batches | lr 1.46 | ms/batch 61.37 | loss 3.80 | ppl 44.78\n", + "| epoch 25 | 2600/ 3181 batches | lr 1.46 | ms/batch 61.34 | loss 3.74 | ppl 42.12\n", + "| epoch 25 | 2800/ 3181 batches | lr 1.46 | ms/batch 61.31 | loss 3.83 | ppl 45.90\n", + "| epoch 25 | 3000/ 3181 batches | lr 1.46 | ms/batch 61.37 | loss 3.74 | ppl 42.12\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 25 | time: 206.63s | valid loss 5.65 | valid ppl 283.82\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 26 | 200/ 3181 batches | lr 1.39 | ms/batch 61.69 | loss 3.86 | ppl 47.46\n", + "| epoch 26 | 400/ 3181 batches | lr 1.39 | ms/batch 61.44 | loss 3.79 | ppl 44.15\n", + "| epoch 26 | 600/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.77 | ppl 43.51\n", + "| epoch 26 | 800/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.81 | ppl 45.08\n", + "| epoch 26 | 1000/ 3181 batches | lr 1.39 | ms/batch 61.42 | loss 3.82 | ppl 45.75\n", + "| epoch 26 | 1200/ 3181 batches | lr 1.39 | ms/batch 61.40 | loss 3.78 | ppl 43.98\n", + "| epoch 26 | 1400/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.81 | ppl 45.28\n", + "| epoch 26 | 1600/ 3181 batches | lr 1.39 | ms/batch 61.28 | loss 3.78 | ppl 43.92\n", + "| epoch 26 | 1800/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.80 | ppl 44.57\n", + "| epoch 26 | 2000/ 3181 batches | lr 1.39 | ms/batch 61.38 | loss 3.77 | ppl 43.55\n", + "| epoch 26 | 2200/ 3181 batches | lr 1.39 | ms/batch 61.44 | loss 3.77 | ppl 43.27\n", + "| epoch 26 | 2400/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.77 | ppl 43.43\n", + "| epoch 26 | 2600/ 3181 batches | lr 1.39 | ms/batch 61.41 | loss 3.71 | ppl 40.92\n", + "| epoch 26 | 2800/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.80 | ppl 44.73\n", + "| epoch 26 | 3000/ 3181 batches | lr 1.39 | ms/batch 61.40 | loss 3.71 | ppl 40.74\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 30 | 400/ 2727 batches | lr 1.13 | ms/batch 53.55 | loss 3.61 | ppl 37.10\n", - "| epoch 30 | 600/ 2727 batches | lr 1.13 | ms/batch 53.51 | loss 3.62 | ppl 37.45\n", - "| epoch 30 | 800/ 2727 batches | lr 1.13 | ms/batch 53.54 | loss 3.61 | ppl 36.94\n", - "| epoch 30 | 1000/ 2727 batches | lr 1.13 | ms/batch 53.56 | loss 3.58 | ppl 35.89\n", - "| epoch 30 | 1200/ 2727 batches | lr 1.13 | ms/batch 53.56 | loss 3.61 | ppl 36.84\n", - "| epoch 30 | 1400/ 2727 batches | lr 1.13 | ms/batch 53.54 | loss 3.57 | ppl 35.52\n", - "| epoch 30 | 1600/ 2727 batches | lr 1.13 | ms/batch 53.56 | loss 3.56 | ppl 35.13\n", - "| epoch 30 | 1800/ 2727 batches | lr 1.13 | ms/batch 53.55 | loss 3.58 | ppl 36.04\n", - "| epoch 30 | 2000/ 2727 batches | lr 1.13 | ms/batch 53.57 | loss 3.58 | ppl 35.96\n", - "| epoch 30 | 2200/ 2727 batches | lr 1.13 | ms/batch 53.55 | loss 3.60 | ppl 36.52\n", - "| epoch 30 | 2400/ 2727 batches | lr 1.13 | ms/batch 53.55 | loss 3.60 | ppl 36.44\n", - "| epoch 30 | 2600/ 2727 batches | lr 1.13 | ms/batch 53.55 | loss 3.61 | ppl 36.99\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 30 | time: 152.86s | valid loss 5.71 | valid ppl 303.11\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 31 | 200/ 2727 batches | lr 1.07 | ms/batch 53.78 | loss 3.66 | ppl 38.97\n", - "| epoch 31 | 400/ 2727 batches | lr 1.07 | ms/batch 53.54 | loss 3.59 | ppl 36.20\n", - "| epoch 31 | 600/ 2727 batches | lr 1.07 | ms/batch 53.56 | loss 3.60 | ppl 36.42\n", - "| epoch 31 | 800/ 2727 batches | lr 1.07 | ms/batch 53.58 | loss 3.59 | ppl 36.21\n", - "| epoch 31 | 1000/ 2727 batches | lr 1.07 | ms/batch 53.54 | loss 3.56 | ppl 35.05\n", - "| epoch 31 | 1200/ 2727 batches | lr 1.07 | ms/batch 53.55 | loss 3.58 | ppl 35.99\n", - "| epoch 31 | 1400/ 2727 batches | lr 1.07 | ms/batch 53.57 | loss 3.55 | ppl 34.82\n", - "| epoch 31 | 1600/ 2727 batches | lr 1.07 | ms/batch 53.57 | loss 3.54 | ppl 34.43\n", - "| epoch 31 | 1800/ 2727 batches | lr 1.07 | ms/batch 53.54 | loss 3.56 | ppl 35.21\n", - "| epoch 31 | 2000/ 2727 batches | lr 1.07 | ms/batch 53.58 | loss 3.56 | ppl 35.07\n", - "| epoch 31 | 2200/ 2727 batches | lr 1.07 | ms/batch 53.57 | loss 3.57 | ppl 35.66\n", - "| epoch 31 | 2400/ 2727 batches | lr 1.07 | ms/batch 53.54 | loss 3.57 | ppl 35.53\n", - "| epoch 31 | 2600/ 2727 batches | lr 1.07 | ms/batch 53.54 | loss 3.58 | ppl 35.99\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 31 | time: 152.87s | valid loss 5.71 | valid ppl 300.48\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 32 | 200/ 2727 batches | lr 1.02 | ms/batch 53.82 | loss 3.64 | ppl 38.07\n", - "| epoch 32 | 400/ 2727 batches | lr 1.02 | ms/batch 53.55 | loss 3.57 | ppl 35.65\n", - "| epoch 32 | 600/ 2727 batches | lr 1.02 | ms/batch 53.52 | loss 3.57 | ppl 35.54\n", - "| epoch 32 | 800/ 2727 batches | lr 1.02 | ms/batch 53.54 | loss 3.57 | ppl 35.46\n", - "| epoch 32 | 1000/ 2727 batches | lr 1.02 | ms/batch 53.56 | loss 3.53 | ppl 34.28\n", - "| epoch 32 | 1200/ 2727 batches | lr 1.02 | ms/batch 53.60 | loss 3.57 | ppl 35.40\n", - "| epoch 32 | 1400/ 2727 batches | lr 1.02 | ms/batch 53.52 | loss 3.53 | ppl 34.04\n", - "| epoch 32 | 1600/ 2727 batches | lr 1.02 | ms/batch 53.54 | loss 3.52 | ppl 33.70\n", - "| epoch 32 | 1800/ 2727 batches | lr 1.02 | ms/batch 53.53 | loss 3.54 | ppl 34.47\n", - "| epoch 32 | 2000/ 2727 batches | lr 1.02 | ms/batch 53.56 | loss 3.54 | ppl 34.34\n", - "| epoch 32 | 2200/ 2727 batches | lr 1.02 | ms/batch 53.54 | loss 3.55 | ppl 34.93\n", - "| epoch 32 | 2400/ 2727 batches | lr 1.02 | ms/batch 53.56 | loss 3.55 | ppl 34.84\n", - "| epoch 32 | 2600/ 2727 batches | lr 1.02 | ms/batch 53.50 | loss 3.57 | ppl 35.47\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 32 | time: 152.84s | valid loss 5.77 | valid ppl 319.05\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 33 | 200/ 2727 batches | lr 0.97 | ms/batch 53.81 | loss 3.61 | ppl 37.13\n", - "| epoch 33 | 400/ 2727 batches | lr 0.97 | ms/batch 53.56 | loss 3.55 | ppl 34.83\n", - "| epoch 33 | 600/ 2727 batches | lr 0.97 | ms/batch 53.53 | loss 3.55 | ppl 34.87\n", - "| epoch 33 | 800/ 2727 batches | lr 0.97 | ms/batch 53.55 | loss 3.55 | ppl 34.64\n", - "| epoch 33 | 1000/ 2727 batches | lr 0.97 | ms/batch 53.57 | loss 3.51 | ppl 33.61\n", - "| epoch 33 | 1200/ 2727 batches | lr 0.97 | ms/batch 53.59 | loss 3.54 | ppl 34.49\n", - "| epoch 33 | 1400/ 2727 batches | lr 0.97 | ms/batch 53.51 | loss 3.50 | ppl 33.19\n", - "| epoch 33 | 1600/ 2727 batches | lr 0.97 | ms/batch 53.60 | loss 3.50 | ppl 33.13\n", - "| epoch 33 | 1800/ 2727 batches | lr 0.97 | ms/batch 53.56 | loss 3.52 | ppl 33.70\n", - "| epoch 33 | 2000/ 2727 batches | lr 0.97 | ms/batch 53.54 | loss 3.52 | ppl 33.72\n", - "| epoch 33 | 2200/ 2727 batches | lr 0.97 | ms/batch 53.56 | loss 3.54 | ppl 34.31\n", - "| epoch 33 | 2400/ 2727 batches | lr 0.97 | ms/batch 53.56 | loss 3.53 | ppl 34.12\n", - "| epoch 33 | 2600/ 2727 batches | lr 0.97 | ms/batch 53.55 | loss 3.55 | ppl 34.69\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 33 | time: 152.88s | valid loss 5.75 | valid ppl 315.11\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 34 | 200/ 2727 batches | lr 0.92 | ms/batch 53.84 | loss 3.60 | ppl 36.43\n", - "| epoch 34 | 400/ 2727 batches | lr 0.92 | ms/batch 53.55 | loss 3.53 | ppl 34.17\n", - "| epoch 34 | 600/ 2727 batches | lr 0.92 | ms/batch 53.58 | loss 3.53 | ppl 34.24\n", - "| epoch 34 | 800/ 2727 batches | lr 0.92 | ms/batch 53.57 | loss 3.53 | ppl 34.12\n", - "| epoch 34 | 1000/ 2727 batches | lr 0.92 | ms/batch 53.56 | loss 3.49 | ppl 32.90\n", - "| epoch 34 | 1200/ 2727 batches | lr 0.92 | ms/batch 53.57 | loss 3.52 | ppl 33.84\n", - "| epoch 34 | 1400/ 2727 batches | lr 0.92 | ms/batch 53.58 | loss 3.49 | ppl 32.76\n", - "| epoch 34 | 1600/ 2727 batches | lr 0.92 | ms/batch 53.57 | loss 3.48 | ppl 32.41\n", - "| epoch 34 | 1800/ 2727 batches | lr 0.92 | ms/batch 53.58 | loss 3.49 | ppl 32.88\n", - "| epoch 34 | 2000/ 2727 batches | lr 0.92 | ms/batch 53.55 | loss 3.49 | ppl 32.92\n", - "| epoch 34 | 2200/ 2727 batches | lr 0.92 | ms/batch 53.58 | loss 3.51 | ppl 33.45\n", - "| epoch 34 | 2400/ 2727 batches | lr 0.92 | ms/batch 53.57 | loss 3.51 | ppl 33.37\n", - "| epoch 34 | 2600/ 2727 batches | lr 0.92 | ms/batch 53.55 | loss 3.52 | ppl 33.91\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 34 | time: 152.92s | valid loss 5.78 | valid ppl 322.80\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 35 | 200/ 2727 batches | lr 0.87 | ms/batch 53.84 | loss 3.58 | ppl 35.78\n", - "| epoch 35 | 400/ 2727 batches | lr 0.87 | ms/batch 53.56 | loss 3.51 | ppl 33.57\n", - "| epoch 35 | 600/ 2727 batches | lr 0.87 | ms/batch 53.59 | loss 3.51 | ppl 33.51\n", - "| epoch 35 | 800/ 2727 batches | lr 0.87 | ms/batch 53.59 | loss 3.50 | ppl 33.28\n", - "| epoch 35 | 1000/ 2727 batches | lr 0.87 | ms/batch 53.54 | loss 3.47 | ppl 32.18\n", - "| epoch 35 | 1200/ 2727 batches | lr 0.87 | ms/batch 53.59 | loss 3.51 | ppl 33.34\n", - "| epoch 35 | 1400/ 2727 batches | lr 0.87 | ms/batch 53.57 | loss 3.47 | ppl 31.99\n", - "| epoch 35 | 1600/ 2727 batches | lr 0.87 | ms/batch 53.57 | loss 3.46 | ppl 31.87\n", - "| epoch 35 | 1800/ 2727 batches | lr 0.87 | ms/batch 53.58 | loss 3.48 | ppl 32.39\n", - "| epoch 35 | 2000/ 2727 batches | lr 0.87 | ms/batch 53.59 | loss 3.47 | ppl 32.23\n", - "| epoch 35 | 2200/ 2727 batches | lr 0.87 | ms/batch 53.55 | loss 3.50 | ppl 33.00\n", - "| epoch 35 | 2400/ 2727 batches | lr 0.87 | ms/batch 53.56 | loss 3.49 | ppl 32.83\n", - "| epoch 35 | 2600/ 2727 batches | lr 0.87 | ms/batch 53.56 | loss 3.51 | ppl 33.51\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 35 | time: 152.93s | valid loss 5.77 | valid ppl 321.70\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 26 | time: 206.67s | valid loss 5.69 | valid ppl 294.72\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 27 | 200/ 3181 batches | lr 1.32 | ms/batch 61.61 | loss 3.83 | ppl 46.08\n", + "| epoch 27 | 400/ 3181 batches | lr 1.32 | ms/batch 61.34 | loss 3.76 | ppl 42.90\n", + "| epoch 27 | 600/ 3181 batches | lr 1.32 | ms/batch 61.37 | loss 3.73 | ppl 41.77\n", + "| epoch 27 | 800/ 3181 batches | lr 1.32 | ms/batch 61.39 | loss 3.78 | ppl 43.61\n", + "| epoch 27 | 1000/ 3181 batches | lr 1.32 | ms/batch 61.38 | loss 3.80 | ppl 44.57\n", + "| epoch 27 | 1200/ 3181 batches | lr 1.32 | ms/batch 61.31 | loss 3.75 | ppl 42.51\n", + "| epoch 27 | 1400/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.79 | ppl 44.18\n", + "| epoch 27 | 1600/ 3181 batches | lr 1.32 | ms/batch 61.30 | loss 3.76 | ppl 42.82\n", + "| epoch 27 | 1800/ 3181 batches | lr 1.32 | ms/batch 61.41 | loss 3.76 | ppl 42.95\n", + "| epoch 27 | 2000/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.75 | ppl 42.42\n", + "| epoch 27 | 2200/ 3181 batches | lr 1.32 | ms/batch 61.35 | loss 3.74 | ppl 42.12\n", + "| epoch 27 | 2400/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.74 | ppl 42.31\n", + "| epoch 27 | 2600/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.68 | ppl 39.83\n", + "| epoch 27 | 2800/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.77 | ppl 43.28\n", + "| epoch 27 | 3000/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.68 | ppl 39.55\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 27 | time: 206.56s | valid loss 5.75 | valid ppl 315.59\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 28 | 200/ 3181 batches | lr 1.25 | ms/batch 61.70 | loss 3.80 | ppl 44.70\n", + "| epoch 28 | 400/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.73 | ppl 41.81\n", + "| epoch 28 | 600/ 3181 batches | lr 1.25 | ms/batch 61.43 | loss 3.71 | ppl 40.76\n", + "| epoch 28 | 800/ 3181 batches | lr 1.25 | ms/batch 61.34 | loss 3.75 | ppl 42.56\n", + "| epoch 28 | 1000/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.77 | ppl 43.35\n", + "| epoch 28 | 1200/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.72 | ppl 41.32\n", + "| epoch 28 | 1400/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.75 | ppl 42.65\n", + "| epoch 28 | 1600/ 3181 batches | lr 1.25 | ms/batch 61.34 | loss 3.73 | ppl 41.67\n", + "| epoch 28 | 1800/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.73 | ppl 41.85\n", + "| epoch 28 | 2000/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.72 | ppl 41.24\n", + "| epoch 28 | 2200/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.71 | ppl 40.83\n", + "| epoch 28 | 2400/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.72 | ppl 41.20\n", + "| epoch 28 | 2600/ 3181 batches | lr 1.25 | ms/batch 61.39 | loss 3.66 | ppl 38.72\n", + "| epoch 28 | 2800/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.74 | ppl 42.13\n", + "| epoch 28 | 3000/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.65 | ppl 38.37\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 28 | time: 206.71s | valid loss 5.77 | valid ppl 320.59\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 29 | 200/ 3181 batches | lr 1.19 | ms/batch 61.73 | loss 3.77 | ppl 43.52\n", + "| epoch 29 | 400/ 3181 batches | lr 1.19 | ms/batch 61.39 | loss 3.70 | ppl 40.52\n", + "| epoch 29 | 600/ 3181 batches | lr 1.19 | ms/batch 61.39 | loss 3.68 | ppl 39.58\n", + "| epoch 29 | 800/ 3181 batches | lr 1.19 | ms/batch 61.48 | loss 3.72 | ppl 41.41\n", + "| epoch 29 | 1000/ 3181 batches | lr 1.19 | ms/batch 61.35 | loss 3.74 | ppl 42.29\n", + "| epoch 29 | 1200/ 3181 batches | lr 1.19 | ms/batch 61.36 | loss 3.70 | ppl 40.36\n", + "| epoch 29 | 1400/ 3181 batches | lr 1.19 | ms/batch 61.37 | loss 3.73 | ppl 41.64\n", + "| epoch 29 | 1600/ 3181 batches | lr 1.19 | ms/batch 61.40 | loss 3.71 | ppl 40.66\n", + "| epoch 29 | 1800/ 3181 batches | lr 1.19 | ms/batch 61.44 | loss 3.72 | ppl 41.08\n", + "| epoch 29 | 2000/ 3181 batches | lr 1.19 | ms/batch 61.44 | loss 3.69 | ppl 40.20\n", + "| epoch 29 | 2200/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.68 | ppl 39.80\n", + "| epoch 29 | 2400/ 3181 batches | lr 1.19 | ms/batch 61.45 | loss 3.70 | ppl 40.25\n", + "| epoch 29 | 2600/ 3181 batches | lr 1.19 | ms/batch 61.47 | loss 3.63 | ppl 37.79\n", + "| epoch 29 | 2800/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.72 | ppl 41.21\n", + "| epoch 29 | 3000/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.62 | ppl 37.43\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 29 | time: 206.79s | valid loss 5.81 | valid ppl 332.16\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 30 | 200/ 3181 batches | lr 1.13 | ms/batch 61.74 | loss 3.74 | ppl 42.22\n", + "| epoch 30 | 400/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.68 | ppl 39.52\n", + "| epoch 30 | 600/ 3181 batches | lr 1.13 | ms/batch 61.41 | loss 3.65 | ppl 38.62\n", + "| epoch 30 | 800/ 3181 batches | lr 1.13 | ms/batch 61.39 | loss 3.70 | ppl 40.47\n", + "| epoch 30 | 1000/ 3181 batches | lr 1.13 | ms/batch 61.50 | loss 3.72 | ppl 41.14\n", + "| epoch 30 | 1200/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.67 | ppl 39.41\n", + "| epoch 30 | 1400/ 3181 batches | lr 1.13 | ms/batch 61.43 | loss 3.71 | ppl 40.66\n", + "| epoch 30 | 1600/ 3181 batches | lr 1.13 | ms/batch 61.40 | loss 3.68 | ppl 39.62\n", + "| epoch 30 | 1800/ 3181 batches | lr 1.13 | ms/batch 61.38 | loss 3.69 | ppl 39.97\n", + "| epoch 30 | 2000/ 3181 batches | lr 1.13 | ms/batch 61.36 | loss 3.67 | ppl 39.34\n", + "| epoch 30 | 2200/ 3181 batches | lr 1.13 | ms/batch 61.43 | loss 3.66 | ppl 38.99\n", + "| epoch 30 | 2400/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.66 | ppl 39.01\n", + "| epoch 30 | 2600/ 3181 batches | lr 1.13 | ms/batch 61.40 | loss 3.61 | ppl 36.84\n", + "| epoch 30 | 2800/ 3181 batches | lr 1.13 | ms/batch 61.50 | loss 3.69 | ppl 40.20\n", + "| epoch 30 | 3000/ 3181 batches | lr 1.13 | ms/batch 61.38 | loss 3.60 | ppl 36.54\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 30 | time: 206.80s | valid loss 5.75 | valid ppl 313.98\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 31 | 200/ 3181 batches | lr 1.07 | ms/batch 61.74 | loss 3.72 | ppl 41.43\n", + "| epoch 31 | 400/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.65 | ppl 38.65\n", + "| epoch 31 | 600/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.63 | ppl 37.82\n", + "| epoch 31 | 800/ 3181 batches | lr 1.07 | ms/batch 61.40 | loss 3.68 | ppl 39.51\n", + "| epoch 31 | 1000/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.69 | ppl 40.17\n", + "| epoch 31 | 1200/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.65 | ppl 38.53\n", + "| epoch 31 | 1400/ 3181 batches | lr 1.07 | ms/batch 61.36 | loss 3.69 | ppl 39.93\n", + "| epoch 31 | 1600/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.66 | ppl 38.77\n", + "| epoch 31 | 1800/ 3181 batches | lr 1.07 | ms/batch 61.39 | loss 3.67 | ppl 39.17\n", + "| epoch 31 | 2000/ 3181 batches | lr 1.07 | ms/batch 61.49 | loss 3.65 | ppl 38.48\n", + "| epoch 31 | 2200/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.63 | ppl 37.78\n", + "| epoch 31 | 2400/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.65 | ppl 38.35\n", + "| epoch 31 | 2600/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.59 | ppl 36.09\n", + "| epoch 31 | 2800/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.67 | ppl 39.29\n", + "| epoch 31 | 3000/ 3181 batches | lr 1.07 | ms/batch 61.36 | loss 3.57 | ppl 35.60\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 31 | time: 206.68s | valid loss 5.82 | valid ppl 335.54\n", "-----------------------------------------------------------------------------------------\n" ] }, @@ -1283,257 +1287,365 @@ "name": "stdout", "output_type": "stream", "text": [ - "| epoch 36 | 200/ 2727 batches | lr 0.83 | ms/batch 53.83 | loss 3.56 | ppl 35.18\n", - "| epoch 36 | 400/ 2727 batches | lr 0.83 | ms/batch 53.51 | loss 3.49 | ppl 32.93\n", - "| epoch 36 | 600/ 2727 batches | lr 0.83 | ms/batch 53.53 | loss 3.50 | ppl 32.98\n", - "| epoch 36 | 800/ 2727 batches | lr 0.83 | ms/batch 53.58 | loss 3.49 | ppl 32.95\n", - "| epoch 36 | 1000/ 2727 batches | lr 0.83 | ms/batch 53.58 | loss 3.46 | ppl 31.83\n", - "| epoch 36 | 1200/ 2727 batches | lr 0.83 | ms/batch 53.57 | loss 3.49 | ppl 32.66\n", - "| epoch 36 | 1400/ 2727 batches | lr 0.83 | ms/batch 53.52 | loss 3.46 | ppl 31.67\n", - "| epoch 36 | 1600/ 2727 batches | lr 0.83 | ms/batch 53.55 | loss 3.45 | ppl 31.48\n", - "| epoch 36 | 1800/ 2727 batches | lr 0.83 | ms/batch 53.54 | loss 3.47 | ppl 31.98\n", - "| epoch 36 | 2000/ 2727 batches | lr 0.83 | ms/batch 53.54 | loss 3.46 | ppl 31.85\n", - "| epoch 36 | 2200/ 2727 batches | lr 0.83 | ms/batch 53.55 | loss 3.48 | ppl 32.55\n", - "| epoch 36 | 2400/ 2727 batches | lr 0.83 | ms/batch 53.54 | loss 3.48 | ppl 32.37\n", - "| epoch 36 | 2600/ 2727 batches | lr 0.83 | ms/batch 53.59 | loss 3.50 | ppl 32.99\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 36 | time: 152.86s | valid loss 5.79 | valid ppl 325.64\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 37 | 200/ 2727 batches | lr 0.79 | ms/batch 53.81 | loss 3.55 | ppl 34.66\n", - "| epoch 37 | 400/ 2727 batches | lr 0.79 | ms/batch 53.58 | loss 3.48 | ppl 32.32\n", - "| epoch 37 | 600/ 2727 batches | lr 0.79 | ms/batch 53.57 | loss 3.48 | ppl 32.49\n", - "| epoch 37 | 800/ 2727 batches | lr 0.79 | ms/batch 53.55 | loss 3.48 | ppl 32.35\n", - "| epoch 37 | 1000/ 2727 batches | lr 0.79 | ms/batch 53.53 | loss 3.44 | ppl 31.18\n", - "| epoch 37 | 1200/ 2727 batches | lr 0.79 | ms/batch 53.50 | loss 3.47 | ppl 32.21\n", - "| epoch 37 | 1400/ 2727 batches | lr 0.79 | ms/batch 53.50 | loss 3.44 | ppl 31.11\n", - "| epoch 37 | 1600/ 2727 batches | lr 0.79 | ms/batch 53.54 | loss 3.43 | ppl 30.90\n", - "| epoch 37 | 1800/ 2727 batches | lr 0.79 | ms/batch 53.56 | loss 3.44 | ppl 31.32\n", - "| epoch 37 | 2000/ 2727 batches | lr 0.79 | ms/batch 53.54 | loss 3.44 | ppl 31.32\n", - "| epoch 37 | 2200/ 2727 batches | lr 0.79 | ms/batch 53.57 | loss 3.46 | ppl 31.85\n", - "| epoch 37 | 2400/ 2727 batches | lr 0.79 | ms/batch 53.59 | loss 3.46 | ppl 31.68\n", - "| epoch 37 | 2600/ 2727 batches | lr 0.79 | ms/batch 53.53 | loss 3.48 | ppl 32.34\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 37 | time: 152.85s | valid loss 5.81 | valid ppl 332.87\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 38 | 200/ 2727 batches | lr 0.75 | ms/batch 53.83 | loss 3.53 | ppl 33.96\n", - "| epoch 38 | 400/ 2727 batches | lr 0.75 | ms/batch 53.58 | loss 3.46 | ppl 31.95\n", - "| epoch 38 | 600/ 2727 batches | lr 0.75 | ms/batch 53.57 | loss 3.46 | ppl 31.93\n", - "| epoch 38 | 800/ 2727 batches | lr 0.75 | ms/batch 53.56 | loss 3.46 | ppl 31.78\n", - "| epoch 38 | 1000/ 2727 batches | lr 0.75 | ms/batch 53.56 | loss 3.42 | ppl 30.67\n", - "| epoch 38 | 1200/ 2727 batches | lr 0.75 | ms/batch 53.57 | loss 3.45 | ppl 31.64\n", - "| epoch 38 | 1400/ 2727 batches | lr 0.75 | ms/batch 53.52 | loss 3.42 | ppl 30.59\n", - "| epoch 38 | 1600/ 2727 batches | lr 0.75 | ms/batch 53.58 | loss 3.42 | ppl 30.46\n", - "| epoch 38 | 1800/ 2727 batches | lr 0.75 | ms/batch 53.56 | loss 3.43 | ppl 30.89\n", - "| epoch 38 | 2000/ 2727 batches | lr 0.75 | ms/batch 53.55 | loss 3.43 | ppl 30.79\n", - "| epoch 38 | 2200/ 2727 batches | lr 0.75 | ms/batch 53.57 | loss 3.45 | ppl 31.52\n", - "| epoch 38 | 2400/ 2727 batches | lr 0.75 | ms/batch 53.53 | loss 3.45 | ppl 31.37\n", - "| epoch 38 | 2600/ 2727 batches | lr 0.75 | ms/batch 53.56 | loss 3.46 | ppl 31.90\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 38 | time: 152.91s | valid loss 5.85 | valid ppl 345.69\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 39 | 200/ 2727 batches | lr 0.71 | ms/batch 53.88 | loss 3.51 | ppl 33.48\n", - "| epoch 39 | 400/ 2727 batches | lr 0.71 | ms/batch 53.56 | loss 3.45 | ppl 31.43\n", - "| epoch 39 | 600/ 2727 batches | lr 0.71 | ms/batch 53.56 | loss 3.45 | ppl 31.60\n", - "| epoch 39 | 800/ 2727 batches | lr 0.71 | ms/batch 53.57 | loss 3.44 | ppl 31.28\n", - "| epoch 39 | 1000/ 2727 batches | lr 0.71 | ms/batch 53.57 | loss 3.42 | ppl 30.42\n", - "| epoch 39 | 1200/ 2727 batches | lr 0.71 | ms/batch 53.56 | loss 3.44 | ppl 31.29\n", - "| epoch 39 | 1400/ 2727 batches | lr 0.71 | ms/batch 53.56 | loss 3.41 | ppl 30.15\n", - "| epoch 39 | 1600/ 2727 batches | lr 0.71 | ms/batch 53.57 | loss 3.40 | ppl 30.06\n", - "| epoch 39 | 1800/ 2727 batches | lr 0.71 | ms/batch 53.57 | loss 3.42 | ppl 30.46\n", - "| epoch 39 | 2000/ 2727 batches | lr 0.71 | ms/batch 53.55 | loss 3.41 | ppl 30.38\n", - "| epoch 39 | 2200/ 2727 batches | lr 0.71 | ms/batch 53.57 | loss 3.43 | ppl 31.02\n", - "| epoch 39 | 2400/ 2727 batches | lr 0.71 | ms/batch 53.61 | loss 3.43 | ppl 30.88\n", - "| epoch 39 | 2600/ 2727 batches | lr 0.71 | ms/batch 53.58 | loss 3.44 | ppl 31.33\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 39 | time: 152.92s | valid loss 5.84 | valid ppl 343.47\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 40 | 200/ 2727 batches | lr 0.68 | ms/batch 53.87 | loss 3.50 | ppl 33.00\n", - "| epoch 40 | 400/ 2727 batches | lr 0.68 | ms/batch 53.57 | loss 3.44 | ppl 31.12\n", - "| epoch 40 | 600/ 2727 batches | lr 0.68 | ms/batch 53.54 | loss 3.44 | ppl 31.19\n", - "| epoch 40 | 800/ 2727 batches | lr 0.68 | ms/batch 53.55 | loss 3.43 | ppl 30.90\n", - "| epoch 40 | 1000/ 2727 batches | lr 0.68 | ms/batch 53.55 | loss 3.40 | ppl 29.94\n", - "| epoch 40 | 1200/ 2727 batches | lr 0.68 | ms/batch 53.55 | loss 3.43 | ppl 30.78\n", - "| epoch 40 | 1400/ 2727 batches | lr 0.68 | ms/batch 53.55 | loss 3.40 | ppl 29.83\n", - "| epoch 40 | 1600/ 2727 batches | lr 0.68 | ms/batch 53.52 | loss 3.39 | ppl 29.69\n", - "| epoch 40 | 1800/ 2727 batches | lr 0.68 | ms/batch 53.54 | loss 3.40 | ppl 30.10\n", - "| epoch 40 | 2000/ 2727 batches | lr 0.68 | ms/batch 53.55 | loss 3.40 | ppl 30.00\n", - "| epoch 40 | 2200/ 2727 batches | lr 0.68 | ms/batch 53.57 | loss 3.42 | ppl 30.59\n", - "| epoch 40 | 2400/ 2727 batches | lr 0.68 | ms/batch 53.56 | loss 3.42 | ppl 30.57\n", - "| epoch 40 | 2600/ 2727 batches | lr 0.68 | ms/batch 53.55 | loss 3.43 | ppl 30.90\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 40 | time: 152.88s | valid loss 5.83 | valid ppl 338.70\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 41 | 200/ 2727 batches | lr 0.64 | ms/batch 53.80 | loss 3.48 | ppl 32.57\n", - "| epoch 41 | 400/ 2727 batches | lr 0.64 | ms/batch 53.50 | loss 3.42 | ppl 30.64\n", - "| epoch 41 | 600/ 2727 batches | lr 0.64 | ms/batch 53.54 | loss 3.42 | ppl 30.65\n", - "| epoch 41 | 800/ 2727 batches | lr 0.64 | ms/batch 53.52 | loss 3.42 | ppl 30.49\n", - "| epoch 41 | 1000/ 2727 batches | lr 0.64 | ms/batch 53.56 | loss 3.39 | ppl 29.54\n", - "| epoch 41 | 1200/ 2727 batches | lr 0.64 | ms/batch 53.56 | loss 3.41 | ppl 30.40\n", - "| epoch 41 | 1400/ 2727 batches | lr 0.64 | ms/batch 53.54 | loss 3.38 | ppl 29.44\n", - "| epoch 41 | 1600/ 2727 batches | lr 0.64 | ms/batch 53.54 | loss 3.38 | ppl 29.33\n", - "| epoch 41 | 1800/ 2727 batches | lr 0.64 | ms/batch 53.54 | loss 3.39 | ppl 29.65\n", - "| epoch 41 | 2000/ 2727 batches | lr 0.64 | ms/batch 53.59 | loss 3.39 | ppl 29.57\n", - "| epoch 41 | 2200/ 2727 batches | lr 0.64 | ms/batch 53.55 | loss 3.41 | ppl 30.35\n", - "| epoch 41 | 2400/ 2727 batches | lr 0.64 | ms/batch 53.57 | loss 3.40 | ppl 30.11\n", - "| epoch 41 | 2600/ 2727 batches | lr 0.64 | ms/batch 53.53 | loss 3.42 | ppl 30.61\n" + "| epoch 32 | 200/ 3181 batches | lr 1.02 | ms/batch 61.77 | loss 3.70 | ppl 40.52\n", + "| epoch 32 | 400/ 3181 batches | lr 1.02 | ms/batch 61.35 | loss 3.64 | ppl 37.96\n", + "| epoch 32 | 600/ 3181 batches | lr 1.02 | ms/batch 61.39 | loss 3.61 | ppl 36.88\n", + "| epoch 32 | 800/ 3181 batches | lr 1.02 | ms/batch 61.41 | loss 3.66 | ppl 38.80\n", + "| epoch 32 | 1000/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.67 | ppl 39.32\n", + "| epoch 32 | 1200/ 3181 batches | lr 1.02 | ms/batch 61.42 | loss 3.63 | ppl 37.59\n", + "| epoch 32 | 1400/ 3181 batches | lr 1.02 | ms/batch 61.46 | loss 3.66 | ppl 38.96\n", + "| epoch 32 | 1600/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.64 | ppl 38.12\n", + "| epoch 32 | 1800/ 3181 batches | lr 1.02 | ms/batch 61.46 | loss 3.64 | ppl 38.28\n", + "| epoch 32 | 2000/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.63 | ppl 37.70\n", + "| epoch 32 | 2200/ 3181 batches | lr 1.02 | ms/batch 61.37 | loss 3.61 | ppl 37.07\n", + "| epoch 32 | 2400/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.62 | ppl 37.39\n", + "| epoch 32 | 2600/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.56 | ppl 35.19\n", + "| epoch 32 | 2800/ 3181 batches | lr 1.02 | ms/batch 61.43 | loss 3.65 | ppl 38.29\n", + "| epoch 32 | 3000/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.55 | ppl 34.89\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 32 | time: 206.72s | valid loss 5.81 | valid ppl 333.52\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 33 | 200/ 3181 batches | lr 0.97 | ms/batch 61.68 | loss 3.68 | ppl 39.71\n", + "| epoch 33 | 400/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.61 | ppl 37.00\n", + "| epoch 33 | 600/ 3181 batches | lr 0.97 | ms/batch 61.36 | loss 3.59 | ppl 36.33\n", + "| epoch 33 | 800/ 3181 batches | lr 0.97 | ms/batch 61.35 | loss 3.64 | ppl 38.02\n", + "| epoch 33 | 1000/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.65 | ppl 38.54\n", + "| epoch 33 | 1200/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.61 | ppl 37.12\n", + "| epoch 33 | 1400/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.64 | ppl 38.27\n", + "| epoch 33 | 1600/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.62 | ppl 37.26\n", + "| epoch 33 | 1800/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.62 | ppl 37.45\n", + "| epoch 33 | 2000/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.61 | ppl 36.92\n", + "| epoch 33 | 2200/ 3181 batches | lr 0.97 | ms/batch 61.37 | loss 3.59 | ppl 36.34\n", + "| epoch 33 | 2400/ 3181 batches | lr 0.97 | ms/batch 61.41 | loss 3.60 | ppl 36.73\n", + "| epoch 33 | 2600/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.54 | ppl 34.54\n", + "| epoch 33 | 2800/ 3181 batches | lr 0.97 | ms/batch 61.39 | loss 3.62 | ppl 37.42\n", + "| epoch 33 | 3000/ 3181 batches | lr 0.97 | ms/batch 61.45 | loss 3.53 | ppl 34.28\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 33 | time: 206.79s | valid loss 5.84 | valid ppl 345.08\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 34 | 200/ 3181 batches | lr 0.92 | ms/batch 61.72 | loss 3.66 | ppl 38.95\n", + "| epoch 34 | 400/ 3181 batches | lr 0.92 | ms/batch 61.44 | loss 3.60 | ppl 36.53\n", + "| epoch 34 | 600/ 3181 batches | lr 0.92 | ms/batch 61.43 | loss 3.57 | ppl 35.49\n", + "| epoch 34 | 800/ 3181 batches | lr 0.92 | ms/batch 61.42 | loss 3.62 | ppl 37.33\n", + "| epoch 34 | 1000/ 3181 batches | lr 0.92 | ms/batch 61.39 | loss 3.63 | ppl 37.79\n", + "| epoch 34 | 1200/ 3181 batches | lr 0.92 | ms/batch 61.34 | loss 3.59 | ppl 36.16\n", + "| epoch 34 | 1400/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.63 | ppl 37.62\n", + "| epoch 34 | 1600/ 3181 batches | lr 0.92 | ms/batch 61.42 | loss 3.60 | ppl 36.58\n", + "| epoch 34 | 1800/ 3181 batches | lr 0.92 | ms/batch 61.37 | loss 3.60 | ppl 36.77\n", + "| epoch 34 | 2000/ 3181 batches | lr 0.92 | ms/batch 61.36 | loss 3.59 | ppl 36.25\n", + "| epoch 34 | 2200/ 3181 batches | lr 0.92 | ms/batch 61.43 | loss 3.58 | ppl 35.76\n", + "| epoch 34 | 2400/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.59 | ppl 36.19\n", + "| epoch 34 | 2600/ 3181 batches | lr 0.92 | ms/batch 61.40 | loss 3.52 | ppl 33.87\n", + "| epoch 34 | 2800/ 3181 batches | lr 0.92 | ms/batch 61.31 | loss 3.61 | ppl 36.90\n", + "| epoch 34 | 3000/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.52 | ppl 33.68\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 34 | time: 206.73s | valid loss 5.83 | valid ppl 341.59\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 35 | 200/ 3181 batches | lr 0.87 | ms/batch 61.62 | loss 3.64 | ppl 38.22\n", + "| epoch 35 | 400/ 3181 batches | lr 0.87 | ms/batch 61.42 | loss 3.58 | ppl 35.82\n", + "| epoch 35 | 600/ 3181 batches | lr 0.87 | ms/batch 61.36 | loss 3.55 | ppl 34.84\n", + "| epoch 35 | 800/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.61 | ppl 36.83\n", + "| epoch 35 | 1000/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.62 | ppl 37.16\n", + "| epoch 35 | 1200/ 3181 batches | lr 0.87 | ms/batch 61.44 | loss 3.57 | ppl 35.54\n", + "| epoch 35 | 1400/ 3181 batches | lr 0.87 | ms/batch 61.35 | loss 3.60 | ppl 36.70\n", + "| epoch 35 | 1600/ 3181 batches | lr 0.87 | ms/batch 61.44 | loss 3.58 | ppl 35.97\n", + "| epoch 35 | 1800/ 3181 batches | lr 0.87 | ms/batch 61.42 | loss 3.58 | ppl 35.94\n", + "| epoch 35 | 2000/ 3181 batches | lr 0.87 | ms/batch 61.48 | loss 3.57 | ppl 35.45\n", + "| epoch 35 | 2200/ 3181 batches | lr 0.87 | ms/batch 61.41 | loss 3.56 | ppl 35.07\n", + "| epoch 35 | 2400/ 3181 batches | lr 0.87 | ms/batch 61.37 | loss 3.57 | ppl 35.36\n", + "| epoch 35 | 2600/ 3181 batches | lr 0.87 | ms/batch 61.32 | loss 3.51 | ppl 33.39\n", + "| epoch 35 | 2800/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.59 | ppl 36.19\n", + "| epoch 35 | 3000/ 3181 batches | lr 0.87 | ms/batch 61.39 | loss 3.50 | ppl 33.10\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 35 | time: 206.71s | valid loss 5.84 | valid ppl 345.09\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 36 | 200/ 3181 batches | lr 0.83 | ms/batch 61.71 | loss 3.63 | ppl 37.53\n", + "| epoch 36 | 400/ 3181 batches | lr 0.83 | ms/batch 61.41 | loss 3.56 | ppl 35.03\n", + "| epoch 36 | 600/ 3181 batches | lr 0.83 | ms/batch 61.40 | loss 3.53 | ppl 34.18\n", + "| epoch 36 | 800/ 3181 batches | lr 0.83 | ms/batch 61.40 | loss 3.59 | ppl 36.15\n", + "| epoch 36 | 1000/ 3181 batches | lr 0.83 | ms/batch 61.36 | loss 3.60 | ppl 36.47\n", + "| epoch 36 | 1200/ 3181 batches | lr 0.83 | ms/batch 61.36 | loss 3.56 | ppl 35.02\n", + "| epoch 36 | 1400/ 3181 batches | lr 0.83 | ms/batch 61.41 | loss 3.59 | ppl 36.26\n", + "| epoch 36 | 1600/ 3181 batches | lr 0.83 | ms/batch 61.38 | loss 3.57 | ppl 35.41\n", + "| epoch 36 | 1800/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.57 | ppl 35.56\n", + "| epoch 36 | 2000/ 3181 batches | lr 0.83 | ms/batch 61.45 | loss 3.55 | ppl 34.90\n", + "| epoch 36 | 2200/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.54 | ppl 34.47\n", + "| epoch 36 | 2400/ 3181 batches | lr 0.83 | ms/batch 61.35 | loss 3.55 | ppl 34.76\n", + "| epoch 36 | 2600/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.49 | ppl 32.79\n", + "| epoch 36 | 2800/ 3181 batches | lr 0.83 | ms/batch 61.38 | loss 3.57 | ppl 35.60\n", + "| epoch 36 | 3000/ 3181 batches | lr 0.83 | ms/batch 61.44 | loss 3.48 | ppl 32.46\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 36 | time: 206.71s | valid loss 5.83 | valid ppl 339.42\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 37 | 200/ 3181 batches | lr 0.79 | ms/batch 61.75 | loss 3.61 | ppl 37.13\n", + "| epoch 37 | 400/ 3181 batches | lr 0.79 | ms/batch 61.40 | loss 3.54 | ppl 34.55\n", + "| epoch 37 | 600/ 3181 batches | lr 0.79 | ms/batch 61.37 | loss 3.51 | ppl 33.58\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 41 | time: 152.85s | valid loss 5.88 | valid ppl 356.84\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 42 | 200/ 2727 batches | lr 0.61 | ms/batch 53.80 | loss 3.47 | ppl 32.18\n", - "| epoch 42 | 400/ 2727 batches | lr 0.61 | ms/batch 53.55 | loss 3.41 | ppl 30.24\n", - "| epoch 42 | 600/ 2727 batches | lr 0.61 | ms/batch 53.57 | loss 3.41 | ppl 30.26\n", - "| epoch 42 | 800/ 2727 batches | lr 0.61 | ms/batch 53.56 | loss 3.41 | ppl 30.22\n", - "| epoch 42 | 1000/ 2727 batches | lr 0.61 | ms/batch 53.52 | loss 3.37 | ppl 29.20\n", - "| epoch 42 | 1200/ 2727 batches | lr 0.61 | ms/batch 53.55 | loss 3.41 | ppl 30.19\n", - "| epoch 42 | 1400/ 2727 batches | lr 0.61 | ms/batch 53.57 | loss 3.37 | ppl 29.18\n", - "| epoch 42 | 1600/ 2727 batches | lr 0.61 | ms/batch 53.55 | loss 3.37 | ppl 29.03\n", - "| epoch 42 | 1800/ 2727 batches | lr 0.61 | ms/batch 53.58 | loss 3.38 | ppl 29.29\n", - "| epoch 42 | 2000/ 2727 batches | lr 0.61 | ms/batch 53.56 | loss 3.37 | ppl 29.19\n", - "| epoch 42 | 2200/ 2727 batches | lr 0.61 | ms/batch 53.56 | loss 3.40 | ppl 30.07\n", - "| epoch 42 | 2400/ 2727 batches | lr 0.61 | ms/batch 53.57 | loss 3.39 | ppl 29.71\n", - "| epoch 42 | 2600/ 2727 batches | lr 0.61 | ms/batch 53.51 | loss 3.41 | ppl 30.28\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 42 | time: 152.86s | valid loss 5.89 | valid ppl 361.10\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 43 | 200/ 2727 batches | lr 0.58 | ms/batch 53.84 | loss 3.46 | ppl 31.79\n", - "| epoch 43 | 400/ 2727 batches | lr 0.58 | ms/batch 53.55 | loss 3.40 | ppl 29.93\n", - "| epoch 43 | 600/ 2727 batches | lr 0.58 | ms/batch 53.54 | loss 3.41 | ppl 30.14\n", - "| epoch 43 | 800/ 2727 batches | lr 0.58 | ms/batch 53.54 | loss 3.40 | ppl 29.82\n", - "| epoch 43 | 1000/ 2727 batches | lr 0.58 | ms/batch 53.55 | loss 3.37 | ppl 28.96\n", - "| epoch 43 | 1200/ 2727 batches | lr 0.58 | ms/batch 53.52 | loss 3.40 | ppl 29.83\n", - "| epoch 43 | 1400/ 2727 batches | lr 0.58 | ms/batch 53.55 | loss 3.36 | ppl 28.81\n", - "| epoch 43 | 1600/ 2727 batches | lr 0.58 | ms/batch 53.53 | loss 3.35 | ppl 28.63\n", - "| epoch 43 | 1800/ 2727 batches | lr 0.58 | ms/batch 53.57 | loss 3.37 | ppl 28.98\n", - "| epoch 43 | 2000/ 2727 batches | lr 0.58 | ms/batch 53.56 | loss 3.36 | ppl 28.86\n", - "| epoch 43 | 2200/ 2727 batches | lr 0.58 | ms/batch 53.57 | loss 3.39 | ppl 29.56\n", - "| epoch 43 | 2400/ 2727 batches | lr 0.58 | ms/batch 53.56 | loss 3.38 | ppl 29.23\n", - "| epoch 43 | 2600/ 2727 batches | lr 0.58 | ms/batch 53.55 | loss 3.39 | ppl 29.80\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 43 | time: 152.86s | valid loss 5.89 | valid ppl 360.64\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 44 | 200/ 2727 batches | lr 0.55 | ms/batch 53.79 | loss 3.45 | ppl 31.40\n", - "| epoch 44 | 400/ 2727 batches | lr 0.55 | ms/batch 53.57 | loss 3.39 | ppl 29.58\n", - "| epoch 44 | 600/ 2727 batches | lr 0.55 | ms/batch 53.51 | loss 3.39 | ppl 29.67\n", - "| epoch 44 | 800/ 2727 batches | lr 0.55 | ms/batch 53.50 | loss 3.39 | ppl 29.62\n", - "| epoch 44 | 1000/ 2727 batches | lr 0.55 | ms/batch 53.52 | loss 3.35 | ppl 28.53\n", - "| epoch 44 | 1200/ 2727 batches | lr 0.55 | ms/batch 53.51 | loss 3.38 | ppl 29.51\n", - "| epoch 44 | 1400/ 2727 batches | lr 0.55 | ms/batch 53.52 | loss 3.34 | ppl 28.34\n", - "| epoch 44 | 1600/ 2727 batches | lr 0.55 | ms/batch 53.54 | loss 3.35 | ppl 28.40\n", - "| epoch 44 | 1800/ 2727 batches | lr 0.55 | ms/batch 53.53 | loss 3.36 | ppl 28.73\n", - "| epoch 44 | 2000/ 2727 batches | lr 0.55 | ms/batch 53.50 | loss 3.35 | ppl 28.62\n", - "| epoch 44 | 2200/ 2727 batches | lr 0.55 | ms/batch 53.53 | loss 3.38 | ppl 29.46\n", - "| epoch 44 | 2400/ 2727 batches | lr 0.55 | ms/batch 53.56 | loss 3.37 | ppl 29.15\n", - "| epoch 44 | 2600/ 2727 batches | lr 0.55 | ms/batch 53.51 | loss 3.39 | ppl 29.55\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 44 | time: 152.80s | valid loss 5.90 | valid ppl 363.28\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 45 | 200/ 2727 batches | lr 0.52 | ms/batch 53.82 | loss 3.44 | ppl 31.17\n", - "| epoch 45 | 400/ 2727 batches | lr 0.52 | ms/batch 53.53 | loss 3.37 | ppl 29.21\n", - "| epoch 45 | 600/ 2727 batches | lr 0.52 | ms/batch 53.54 | loss 3.38 | ppl 29.47\n", - "| epoch 45 | 800/ 2727 batches | lr 0.52 | ms/batch 53.56 | loss 3.37 | ppl 29.18\n", - "| epoch 45 | 1000/ 2727 batches | lr 0.52 | ms/batch 53.56 | loss 3.34 | ppl 28.18\n", - "| epoch 45 | 1200/ 2727 batches | lr 0.52 | ms/batch 53.55 | loss 3.37 | ppl 29.21\n", - "| epoch 45 | 1400/ 2727 batches | lr 0.52 | ms/batch 53.59 | loss 3.34 | ppl 28.17\n", - "| epoch 45 | 1600/ 2727 batches | lr 0.52 | ms/batch 53.60 | loss 3.34 | ppl 28.12\n", - "| epoch 45 | 1800/ 2727 batches | lr 0.52 | ms/batch 53.56 | loss 3.34 | ppl 28.34\n", - "| epoch 45 | 2000/ 2727 batches | lr 0.52 | ms/batch 53.60 | loss 3.34 | ppl 28.15\n", - "| epoch 45 | 2200/ 2727 batches | lr 0.52 | ms/batch 53.53 | loss 3.37 | ppl 29.09\n", - "| epoch 45 | 2400/ 2727 batches | lr 0.52 | ms/batch 53.58 | loss 3.36 | ppl 28.73\n", - "| epoch 45 | 2600/ 2727 batches | lr 0.52 | ms/batch 53.57 | loss 3.38 | ppl 29.24\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 45 | time: 152.91s | valid loss 5.88 | valid ppl 357.62\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 46 | 200/ 2727 batches | lr 0.50 | ms/batch 53.82 | loss 3.43 | ppl 30.82\n", - "| epoch 46 | 400/ 2727 batches | lr 0.50 | ms/batch 53.58 | loss 3.37 | ppl 29.09\n", - "| epoch 46 | 600/ 2727 batches | lr 0.50 | ms/batch 53.60 | loss 3.37 | ppl 29.09\n", - "| epoch 46 | 800/ 2727 batches | lr 0.50 | ms/batch 53.53 | loss 3.37 | ppl 29.05\n", - "| epoch 46 | 1000/ 2727 batches | lr 0.50 | ms/batch 53.49 | loss 3.33 | ppl 27.90\n", - "| epoch 46 | 1200/ 2727 batches | lr 0.50 | ms/batch 53.54 | loss 3.36 | ppl 28.87\n", - "| epoch 46 | 1400/ 2727 batches | lr 0.50 | ms/batch 53.54 | loss 3.33 | ppl 27.82\n", - "| epoch 46 | 1600/ 2727 batches | lr 0.50 | ms/batch 53.51 | loss 3.33 | ppl 28.01\n", - "| epoch 46 | 1800/ 2727 batches | lr 0.50 | ms/batch 53.53 | loss 3.34 | ppl 28.27\n", - "| epoch 46 | 2000/ 2727 batches | lr 0.50 | ms/batch 53.55 | loss 3.33 | ppl 27.98\n", - "| epoch 46 | 2200/ 2727 batches | lr 0.50 | ms/batch 53.55 | loss 3.36 | ppl 28.71\n", - "| epoch 46 | 2400/ 2727 batches | lr 0.50 | ms/batch 53.59 | loss 3.36 | ppl 28.66\n", - "| epoch 46 | 2600/ 2727 batches | lr 0.50 | ms/batch 53.54 | loss 3.37 | ppl 29.06\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 46 | time: 152.86s | valid loss 5.90 | valid ppl 365.76\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 47 | 200/ 2727 batches | lr 0.47 | ms/batch 53.82 | loss 3.42 | ppl 30.45\n", - "| epoch 47 | 400/ 2727 batches | lr 0.47 | ms/batch 53.51 | loss 3.36 | ppl 28.78\n", - "| epoch 47 | 600/ 2727 batches | lr 0.47 | ms/batch 53.48 | loss 3.36 | ppl 28.84\n", - "| epoch 47 | 800/ 2727 batches | lr 0.47 | ms/batch 53.53 | loss 3.36 | ppl 28.80\n", - "| epoch 47 | 1000/ 2727 batches | lr 0.47 | ms/batch 53.54 | loss 3.32 | ppl 27.68\n", - "| epoch 47 | 1200/ 2727 batches | lr 0.47 | ms/batch 53.51 | loss 3.35 | ppl 28.58\n", - "| epoch 47 | 1400/ 2727 batches | lr 0.47 | ms/batch 53.54 | loss 3.32 | ppl 27.63\n", - "| epoch 47 | 1600/ 2727 batches | lr 0.47 | ms/batch 53.53 | loss 3.32 | ppl 27.62\n", - "| epoch 47 | 1800/ 2727 batches | lr 0.47 | ms/batch 53.55 | loss 3.33 | ppl 27.84\n", - "| epoch 47 | 2000/ 2727 batches | lr 0.47 | ms/batch 53.55 | loss 3.33 | ppl 27.85\n" + "| epoch 37 | 800/ 3181 batches | lr 0.79 | ms/batch 61.32 | loss 3.57 | ppl 35.62\n", + "| epoch 37 | 1000/ 3181 batches | lr 0.79 | ms/batch 61.30 | loss 3.58 | ppl 35.97\n", + "| epoch 37 | 1200/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.54 | ppl 34.53\n", + "| epoch 37 | 1400/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.58 | ppl 35.77\n", + "| epoch 37 | 1600/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.55 | ppl 34.77\n", + "| epoch 37 | 1800/ 3181 batches | lr 0.79 | ms/batch 61.40 | loss 3.55 | ppl 34.93\n", + "| epoch 37 | 2000/ 3181 batches | lr 0.79 | ms/batch 61.41 | loss 3.53 | ppl 34.29\n", + "| epoch 37 | 2200/ 3181 batches | lr 0.79 | ms/batch 61.45 | loss 3.53 | ppl 34.02\n", + "| epoch 37 | 2400/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.53 | ppl 34.29\n", + "| epoch 37 | 2600/ 3181 batches | lr 0.79 | ms/batch 61.37 | loss 3.48 | ppl 32.30\n", + "| epoch 37 | 2800/ 3181 batches | lr 0.79 | ms/batch 61.42 | loss 3.56 | ppl 35.19\n", + "| epoch 37 | 3000/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.47 | ppl 32.07\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 37 | time: 206.68s | valid loss 5.87 | valid ppl 352.86\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 38 | 200/ 3181 batches | lr 0.75 | ms/batch 61.67 | loss 3.59 | ppl 36.37\n", + "| epoch 38 | 400/ 3181 batches | lr 0.75 | ms/batch 61.50 | loss 3.53 | ppl 34.01\n", + "| epoch 38 | 600/ 3181 batches | lr 0.75 | ms/batch 61.39 | loss 3.50 | ppl 33.04\n", + "| epoch 38 | 800/ 3181 batches | lr 0.75 | ms/batch 61.38 | loss 3.56 | ppl 35.12\n", + "| epoch 38 | 1000/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.57 | ppl 35.51\n", + "| epoch 38 | 1200/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.52 | ppl 33.89\n", + "| epoch 38 | 1400/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.56 | ppl 35.27\n", + "| epoch 38 | 1600/ 3181 batches | lr 0.75 | ms/batch 61.46 | loss 3.54 | ppl 34.43\n", + "| epoch 38 | 1800/ 3181 batches | lr 0.75 | ms/batch 61.40 | loss 3.54 | ppl 34.47\n", + "| epoch 38 | 2000/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.52 | ppl 33.89\n", + "| epoch 38 | 2200/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.52 | ppl 33.72\n", + "| epoch 38 | 2400/ 3181 batches | lr 0.75 | ms/batch 61.41 | loss 3.52 | ppl 33.79\n", + "| epoch 38 | 2600/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.46 | ppl 31.89\n", + "| epoch 38 | 2800/ 3181 batches | lr 0.75 | ms/batch 61.40 | loss 3.54 | ppl 34.41\n", + "| epoch 38 | 3000/ 3181 batches | lr 0.75 | ms/batch 61.50 | loss 3.45 | ppl 31.48\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 38 | time: 206.80s | valid loss 5.88 | valid ppl 358.16\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 39 | 200/ 3181 batches | lr 0.71 | ms/batch 61.71 | loss 3.58 | ppl 36.05\n", + "| epoch 39 | 400/ 3181 batches | lr 0.71 | ms/batch 61.40 | loss 3.52 | ppl 33.71\n", + "| epoch 39 | 600/ 3181 batches | lr 0.71 | ms/batch 61.44 | loss 3.49 | ppl 32.73\n", + "| epoch 39 | 800/ 3181 batches | lr 0.71 | ms/batch 61.34 | loss 3.54 | ppl 34.55\n", + "| epoch 39 | 1000/ 3181 batches | lr 0.71 | ms/batch 61.37 | loss 3.56 | ppl 35.03\n", + "| epoch 39 | 1200/ 3181 batches | lr 0.71 | ms/batch 61.41 | loss 3.51 | ppl 33.38\n", + "| epoch 39 | 1400/ 3181 batches | lr 0.71 | ms/batch 61.43 | loss 3.55 | ppl 34.77\n", + "| epoch 39 | 1600/ 3181 batches | lr 0.71 | ms/batch 61.43 | loss 3.52 | ppl 33.79\n", + "| epoch 39 | 1800/ 3181 batches | lr 0.71 | ms/batch 61.46 | loss 3.52 | ppl 33.85\n", + "| epoch 39 | 2000/ 3181 batches | lr 0.71 | ms/batch 61.41 | loss 3.51 | ppl 33.32\n", + "| epoch 39 | 2200/ 3181 batches | lr 0.71 | ms/batch 61.38 | loss 3.50 | ppl 33.21\n", + "| epoch 39 | 2400/ 3181 batches | lr 0.71 | ms/batch 61.38 | loss 3.51 | ppl 33.39\n", + "| epoch 39 | 2600/ 3181 batches | lr 0.71 | ms/batch 61.42 | loss 3.45 | ppl 31.48\n", + "| epoch 39 | 2800/ 3181 batches | lr 0.71 | ms/batch 61.37 | loss 3.53 | ppl 34.01\n", + "| epoch 39 | 3000/ 3181 batches | lr 0.71 | ms/batch 61.42 | loss 3.43 | ppl 30.93\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 39 | time: 206.75s | valid loss 5.92 | valid ppl 370.78\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 40 | 200/ 3181 batches | lr 0.68 | ms/batch 61.74 | loss 3.57 | ppl 35.52\n", + "| epoch 40 | 400/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.50 | ppl 33.05\n", + "| epoch 40 | 600/ 3181 batches | lr 0.68 | ms/batch 61.43 | loss 3.47 | ppl 32.19\n", + "| epoch 40 | 800/ 3181 batches | lr 0.68 | ms/batch 61.44 | loss 3.53 | ppl 34.18\n", + "| epoch 40 | 1000/ 3181 batches | lr 0.68 | ms/batch 61.37 | loss 3.54 | ppl 34.37\n", + "| epoch 40 | 1200/ 3181 batches | lr 0.68 | ms/batch 61.45 | loss 3.50 | ppl 33.04\n", + "| epoch 40 | 1400/ 3181 batches | lr 0.68 | ms/batch 61.45 | loss 3.53 | ppl 34.21\n", + "| epoch 40 | 1600/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.51 | ppl 33.31\n", + "| epoch 40 | 1800/ 3181 batches | lr 0.68 | ms/batch 61.37 | loss 3.51 | ppl 33.39\n", + "| epoch 40 | 2000/ 3181 batches | lr 0.68 | ms/batch 61.44 | loss 3.50 | ppl 32.98\n", + "| epoch 40 | 2200/ 3181 batches | lr 0.68 | ms/batch 61.40 | loss 3.49 | ppl 32.67\n", + "| epoch 40 | 2400/ 3181 batches | lr 0.68 | ms/batch 61.43 | loss 3.49 | ppl 32.91\n", + "| epoch 40 | 2600/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.44 | ppl 31.10\n", + "| epoch 40 | 2800/ 3181 batches | lr 0.68 | ms/batch 61.34 | loss 3.51 | ppl 33.46\n", + "| epoch 40 | 3000/ 3181 batches | lr 0.68 | ms/batch 61.28 | loss 3.42 | ppl 30.62\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 40 | time: 206.76s | valid loss 5.93 | valid ppl 376.29\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 41 | 200/ 3181 batches | lr 0.64 | ms/batch 61.65 | loss 3.56 | ppl 35.06\n", + "| epoch 41 | 400/ 3181 batches | lr 0.64 | ms/batch 61.47 | loss 3.49 | ppl 32.70\n", + "| epoch 41 | 600/ 3181 batches | lr 0.64 | ms/batch 61.39 | loss 3.45 | ppl 31.62\n", + "| epoch 41 | 800/ 3181 batches | lr 0.64 | ms/batch 61.38 | loss 3.52 | ppl 33.65\n", + "| epoch 41 | 1000/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.53 | ppl 33.98\n", + "| epoch 41 | 1200/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.49 | ppl 32.75\n", + "| epoch 41 | 1400/ 3181 batches | lr 0.64 | ms/batch 61.41 | loss 3.52 | ppl 33.71\n", + "| epoch 41 | 1600/ 3181 batches | lr 0.64 | ms/batch 61.45 | loss 3.50 | ppl 33.06\n", + "| epoch 41 | 1800/ 3181 batches | lr 0.64 | ms/batch 61.35 | loss 3.50 | ppl 33.27\n", + "| epoch 41 | 2000/ 3181 batches | lr 0.64 | ms/batch 61.43 | loss 3.49 | ppl 32.72\n", + "| epoch 41 | 2200/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.47 | ppl 32.12\n", + "| epoch 41 | 2400/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.48 | ppl 32.54\n", + "| epoch 41 | 2600/ 3181 batches | lr 0.64 | ms/batch 61.44 | loss 3.43 | ppl 30.92\n", + "| epoch 41 | 2800/ 3181 batches | lr 0.64 | ms/batch 61.37 | loss 3.50 | ppl 33.15\n", + "| epoch 41 | 3000/ 3181 batches | lr 0.64 | ms/batch 61.38 | loss 3.41 | ppl 30.36\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 41 | time: 206.74s | valid loss 5.91 | valid ppl 369.07\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 42 | 200/ 3181 batches | lr 0.61 | ms/batch 61.72 | loss 3.55 | ppl 34.66\n", + "| epoch 42 | 400/ 3181 batches | lr 0.61 | ms/batch 61.40 | loss 3.48 | ppl 32.31\n", + "| epoch 42 | 600/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.45 | ppl 31.42\n", + "| epoch 42 | 800/ 3181 batches | lr 0.61 | ms/batch 61.36 | loss 3.51 | ppl 33.32\n", + "| epoch 42 | 1000/ 3181 batches | lr 0.61 | ms/batch 61.42 | loss 3.52 | ppl 33.79\n", + "| epoch 42 | 1200/ 3181 batches | lr 0.61 | ms/batch 61.43 | loss 3.47 | ppl 32.12\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| epoch 42 | 1400/ 3181 batches | lr 0.61 | ms/batch 61.45 | loss 3.50 | ppl 33.28\n", + "| epoch 42 | 1600/ 3181 batches | lr 0.61 | ms/batch 61.41 | loss 3.49 | ppl 32.66\n", + "| epoch 42 | 1800/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.49 | ppl 32.80\n", + "| epoch 42 | 2000/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.47 | ppl 32.27\n", + "| epoch 42 | 2200/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.46 | ppl 31.79\n", + "| epoch 42 | 2400/ 3181 batches | lr 0.61 | ms/batch 61.44 | loss 3.48 | ppl 32.32\n", + "| epoch 42 | 2600/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.42 | ppl 30.42\n", + "| epoch 42 | 2800/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.50 | ppl 32.97\n", + "| epoch 42 | 3000/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.40 | ppl 29.94\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 42 | time: 206.74s | valid loss 5.92 | valid ppl 371.93\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 43 | 200/ 3181 batches | lr 0.58 | ms/batch 61.68 | loss 3.53 | ppl 34.15\n", + "| epoch 43 | 400/ 3181 batches | lr 0.58 | ms/batch 61.40 | loss 3.47 | ppl 32.05\n", + "| epoch 43 | 600/ 3181 batches | lr 0.58 | ms/batch 61.38 | loss 3.44 | ppl 31.09\n", + "| epoch 43 | 800/ 3181 batches | lr 0.58 | ms/batch 61.37 | loss 3.50 | ppl 33.06\n", + "| epoch 43 | 1000/ 3181 batches | lr 0.58 | ms/batch 61.42 | loss 3.51 | ppl 33.36\n", + "| epoch 43 | 1200/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.47 | ppl 31.98\n", + "| epoch 43 | 1400/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.50 | ppl 32.97\n", + "| epoch 43 | 1600/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.47 | ppl 32.29\n", + "| epoch 43 | 1800/ 3181 batches | lr 0.58 | ms/batch 61.46 | loss 3.47 | ppl 32.22\n", + "| epoch 43 | 2000/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.46 | ppl 31.72\n", + "| epoch 43 | 2200/ 3181 batches | lr 0.58 | ms/batch 61.33 | loss 3.45 | ppl 31.37\n", + "| epoch 43 | 2400/ 3181 batches | lr 0.58 | ms/batch 61.36 | loss 3.46 | ppl 31.81\n", + "| epoch 43 | 2600/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.41 | ppl 30.17\n", + "| epoch 43 | 2800/ 3181 batches | lr 0.58 | ms/batch 61.44 | loss 3.49 | ppl 32.63\n", + "| epoch 43 | 3000/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.39 | ppl 29.80\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 43 | time: 206.72s | valid loss 5.98 | valid ppl 394.70\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 44 | 200/ 3181 batches | lr 0.55 | ms/batch 61.69 | loss 3.52 | ppl 33.79\n", + "| epoch 44 | 400/ 3181 batches | lr 0.55 | ms/batch 61.47 | loss 3.46 | ppl 31.81\n", + "| epoch 44 | 600/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.42 | ppl 30.69\n", + "| epoch 44 | 800/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.48 | ppl 32.49\n", + "| epoch 44 | 1000/ 3181 batches | lr 0.55 | ms/batch 61.43 | loss 3.49 | ppl 32.84\n", + "| epoch 44 | 1200/ 3181 batches | lr 0.55 | ms/batch 61.43 | loss 3.45 | ppl 31.56\n", + "| epoch 44 | 1400/ 3181 batches | lr 0.55 | ms/batch 61.41 | loss 3.48 | ppl 32.59\n", + "| epoch 44 | 1600/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.46 | ppl 31.93\n", + "| epoch 44 | 1800/ 3181 batches | lr 0.55 | ms/batch 61.41 | loss 3.46 | ppl 31.94\n", + "| epoch 44 | 2000/ 3181 batches | lr 0.55 | ms/batch 61.36 | loss 3.45 | ppl 31.62\n", + "| epoch 44 | 2200/ 3181 batches | lr 0.55 | ms/batch 61.37 | loss 3.44 | ppl 31.16\n", + "| epoch 44 | 2400/ 3181 batches | lr 0.55 | ms/batch 61.35 | loss 3.45 | ppl 31.47\n", + "| epoch 44 | 2600/ 3181 batches | lr 0.55 | ms/batch 61.40 | loss 3.39 | ppl 29.77\n", + "| epoch 44 | 2800/ 3181 batches | lr 0.55 | ms/batch 61.44 | loss 3.47 | ppl 32.19\n", + "| epoch 44 | 3000/ 3181 batches | lr 0.55 | ms/batch 61.44 | loss 3.38 | ppl 29.40\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 44 | time: 206.75s | valid loss 5.96 | valid ppl 389.15\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 45 | 200/ 3181 batches | lr 0.52 | ms/batch 61.74 | loss 3.51 | ppl 33.51\n", + "| epoch 45 | 400/ 3181 batches | lr 0.52 | ms/batch 61.41 | loss 3.45 | ppl 31.39\n", + "| epoch 45 | 600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.42 | ppl 30.55\n", + "| epoch 45 | 800/ 3181 batches | lr 0.52 | ms/batch 61.35 | loss 3.48 | ppl 32.50\n", + "| epoch 45 | 1000/ 3181 batches | lr 0.52 | ms/batch 61.38 | loss 3.48 | ppl 32.51\n", + "| epoch 45 | 1200/ 3181 batches | lr 0.52 | ms/batch 61.33 | loss 3.45 | ppl 31.39\n", + "| epoch 45 | 1400/ 3181 batches | lr 0.52 | ms/batch 61.43 | loss 3.47 | ppl 32.20\n", + "| epoch 45 | 1600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.45 | ppl 31.63\n", + "| epoch 45 | 1800/ 3181 batches | lr 0.52 | ms/batch 61.45 | loss 3.45 | ppl 31.61\n", + "| epoch 45 | 2000/ 3181 batches | lr 0.52 | ms/batch 61.36 | loss 3.44 | ppl 31.17\n", + "| epoch 45 | 2200/ 3181 batches | lr 0.52 | ms/batch 61.48 | loss 3.43 | ppl 31.02\n", + "| epoch 45 | 2400/ 3181 batches | lr 0.52 | ms/batch 61.45 | loss 3.44 | ppl 31.18\n", + "| epoch 45 | 2600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.39 | ppl 29.52\n", + "| epoch 45 | 2800/ 3181 batches | lr 0.52 | ms/batch 61.43 | loss 3.46 | ppl 31.72\n", + "| epoch 45 | 3000/ 3181 batches | lr 0.52 | ms/batch 61.48 | loss 3.37 | ppl 29.15\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 45 | time: 206.77s | valid loss 5.99 | valid ppl 398.09\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 46 | 200/ 3181 batches | lr 0.50 | ms/batch 61.71 | loss 3.50 | ppl 33.04\n", + "| epoch 46 | 400/ 3181 batches | lr 0.50 | ms/batch 61.45 | loss 3.44 | ppl 31.04\n", + "| epoch 46 | 600/ 3181 batches | lr 0.50 | ms/batch 61.39 | loss 3.41 | ppl 30.26\n", + "| epoch 46 | 800/ 3181 batches | lr 0.50 | ms/batch 61.47 | loss 3.47 | ppl 32.01\n", + "| epoch 46 | 1000/ 3181 batches | lr 0.50 | ms/batch 61.39 | loss 3.47 | ppl 32.08\n", + "| epoch 46 | 1200/ 3181 batches | lr 0.50 | ms/batch 61.41 | loss 3.43 | ppl 30.86\n", + "| epoch 46 | 1400/ 3181 batches | lr 0.50 | ms/batch 61.34 | loss 3.47 | ppl 32.15\n", + "| epoch 46 | 1600/ 3181 batches | lr 0.50 | ms/batch 61.44 | loss 3.44 | ppl 31.32\n", + "| epoch 46 | 1800/ 3181 batches | lr 0.50 | ms/batch 61.42 | loss 3.45 | ppl 31.49\n", + "| epoch 46 | 2000/ 3181 batches | lr 0.50 | ms/batch 61.42 | loss 3.44 | ppl 31.04\n", + "| epoch 46 | 2200/ 3181 batches | lr 0.50 | ms/batch 61.48 | loss 3.42 | ppl 30.63\n", + "| epoch 46 | 2400/ 3181 batches | lr 0.50 | ms/batch 61.35 | loss 3.43 | ppl 30.95\n", + "| epoch 46 | 2600/ 3181 batches | lr 0.50 | ms/batch 61.37 | loss 3.38 | ppl 29.38\n", + "| epoch 46 | 2800/ 3181 batches | lr 0.50 | ms/batch 61.41 | loss 3.45 | ppl 31.45\n", + "| epoch 46 | 3000/ 3181 batches | lr 0.50 | ms/batch 61.45 | loss 3.37 | ppl 28.96\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 46 | time: 206.77s | valid loss 5.96 | valid ppl 389.00\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 47 | 200/ 3181 batches | lr 0.47 | ms/batch 61.74 | loss 3.49 | ppl 32.78\n", + "| epoch 47 | 400/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 30.76\n", + "| epoch 47 | 600/ 3181 batches | lr 0.47 | ms/batch 61.47 | loss 3.40 | ppl 29.86\n", + "| epoch 47 | 800/ 3181 batches | lr 0.47 | ms/batch 61.39 | loss 3.46 | ppl 31.86\n", + "| epoch 47 | 1000/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.46 | ppl 31.90\n", + "| epoch 47 | 1200/ 3181 batches | lr 0.47 | ms/batch 61.38 | loss 3.42 | ppl 30.71\n", + "| epoch 47 | 1400/ 3181 batches | lr 0.47 | ms/batch 61.39 | loss 3.46 | ppl 31.91\n", + "| epoch 47 | 1600/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 31.00\n", + "| epoch 47 | 1800/ 3181 batches | lr 0.47 | ms/batch 61.54 | loss 3.44 | ppl 31.18\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 47 | 2200/ 2727 batches | lr 0.47 | ms/batch 53.54 | loss 3.35 | ppl 28.47\n", - "| epoch 47 | 2400/ 2727 batches | lr 0.47 | ms/batch 53.51 | loss 3.34 | ppl 28.17\n", - "| epoch 47 | 2600/ 2727 batches | lr 0.47 | ms/batch 53.54 | loss 3.36 | ppl 28.89\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 47 | time: 152.80s | valid loss 5.88 | valid ppl 358.73\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 48 | 200/ 2727 batches | lr 0.45 | ms/batch 53.87 | loss 3.41 | ppl 30.22\n", - "| epoch 48 | 400/ 2727 batches | lr 0.45 | ms/batch 53.55 | loss 3.35 | ppl 28.59\n", - "| epoch 48 | 600/ 2727 batches | lr 0.45 | ms/batch 53.56 | loss 3.36 | ppl 28.69\n", - "| epoch 48 | 800/ 2727 batches | lr 0.45 | ms/batch 53.53 | loss 3.35 | ppl 28.56\n", - "| epoch 48 | 1000/ 2727 batches | lr 0.45 | ms/batch 53.50 | loss 3.31 | ppl 27.35\n", - "| epoch 48 | 1200/ 2727 batches | lr 0.45 | ms/batch 53.55 | loss 3.34 | ppl 28.31\n", - "| epoch 48 | 1400/ 2727 batches | lr 0.45 | ms/batch 53.52 | loss 3.31 | ppl 27.48\n", - "| epoch 48 | 1600/ 2727 batches | lr 0.45 | ms/batch 53.53 | loss 3.31 | ppl 27.50\n", - "| epoch 48 | 1800/ 2727 batches | lr 0.45 | ms/batch 53.51 | loss 3.32 | ppl 27.60\n", - "| epoch 48 | 2000/ 2727 batches | lr 0.45 | ms/batch 53.52 | loss 3.31 | ppl 27.47\n", - "| epoch 48 | 2200/ 2727 batches | lr 0.45 | ms/batch 53.54 | loss 3.34 | ppl 28.19\n", - "| epoch 48 | 2400/ 2727 batches | lr 0.45 | ms/batch 53.53 | loss 3.33 | ppl 27.93\n", - "| epoch 48 | 2600/ 2727 batches | lr 0.45 | ms/batch 53.54 | loss 3.35 | ppl 28.55\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 48 | time: 152.84s | valid loss 5.90 | valid ppl 365.29\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 49 | 200/ 2727 batches | lr 0.43 | ms/batch 53.78 | loss 3.40 | ppl 30.09\n", - "| epoch 49 | 400/ 2727 batches | lr 0.43 | ms/batch 53.53 | loss 3.35 | ppl 28.49\n", - "| epoch 49 | 600/ 2727 batches | lr 0.43 | ms/batch 53.54 | loss 3.35 | ppl 28.53\n", - "| epoch 49 | 800/ 2727 batches | lr 0.43 | ms/batch 53.55 | loss 3.34 | ppl 28.24\n", - "| epoch 49 | 1000/ 2727 batches | lr 0.43 | ms/batch 53.54 | loss 3.31 | ppl 27.33\n", - "| epoch 49 | 1200/ 2727 batches | lr 0.43 | ms/batch 53.58 | loss 3.34 | ppl 28.22\n", - "| epoch 49 | 1400/ 2727 batches | lr 0.43 | ms/batch 53.51 | loss 3.30 | ppl 27.17\n", - "| epoch 49 | 1600/ 2727 batches | lr 0.43 | ms/batch 53.53 | loss 3.30 | ppl 27.18\n", - "| epoch 49 | 1800/ 2727 batches | lr 0.43 | ms/batch 53.53 | loss 3.31 | ppl 27.27\n", - "| epoch 49 | 2000/ 2727 batches | lr 0.43 | ms/batch 53.51 | loss 3.31 | ppl 27.28\n", - "| epoch 49 | 2200/ 2727 batches | lr 0.43 | ms/batch 53.52 | loss 3.33 | ppl 28.07\n", - "| epoch 49 | 2400/ 2727 batches | lr 0.43 | ms/batch 53.49 | loss 3.33 | ppl 27.83\n", - "| epoch 49 | 2600/ 2727 batches | lr 0.43 | ms/batch 53.57 | loss 3.34 | ppl 28.35\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 49 | time: 152.82s | valid loss 5.92 | valid ppl 373.45\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 50 | 200/ 2727 batches | lr 0.40 | ms/batch 53.81 | loss 3.39 | ppl 29.78\n", - "| epoch 50 | 400/ 2727 batches | lr 0.40 | ms/batch 53.56 | loss 3.34 | ppl 28.10\n", - "| epoch 50 | 600/ 2727 batches | lr 0.40 | ms/batch 53.50 | loss 3.34 | ppl 28.15\n", - "| epoch 50 | 800/ 2727 batches | lr 0.40 | ms/batch 53.54 | loss 3.33 | ppl 27.93\n", - "| epoch 50 | 1000/ 2727 batches | lr 0.40 | ms/batch 53.52 | loss 3.30 | ppl 27.23\n", - "| epoch 50 | 1200/ 2727 batches | lr 0.40 | ms/batch 53.54 | loss 3.33 | ppl 27.84\n", - "| epoch 50 | 1400/ 2727 batches | lr 0.40 | ms/batch 53.56 | loss 3.29 | ppl 26.92\n", - "| epoch 50 | 1600/ 2727 batches | lr 0.40 | ms/batch 53.58 | loss 3.30 | ppl 27.10\n", - "| epoch 50 | 1800/ 2727 batches | lr 0.40 | ms/batch 53.55 | loss 3.30 | ppl 27.16\n", - "| epoch 50 | 2000/ 2727 batches | lr 0.40 | ms/batch 53.55 | loss 3.30 | ppl 27.02\n", - "| epoch 50 | 2200/ 2727 batches | lr 0.40 | ms/batch 53.54 | loss 3.33 | ppl 27.89\n", - "| epoch 50 | 2400/ 2727 batches | lr 0.40 | ms/batch 53.58 | loss 3.32 | ppl 27.57\n", - "| epoch 50 | 2600/ 2727 batches | lr 0.40 | ms/batch 53.57 | loss 3.33 | ppl 27.91\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 50 | time: 152.87s | valid loss 5.92 | valid ppl 371.61\n", + "| epoch 47 | 2000/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 30.73\n", + "| epoch 47 | 2200/ 3181 batches | lr 0.47 | ms/batch 61.49 | loss 3.41 | ppl 30.37\n", + "| epoch 47 | 2400/ 3181 batches | lr 0.47 | ms/batch 61.52 | loss 3.42 | ppl 30.63\n", + "| epoch 47 | 2600/ 3181 batches | lr 0.47 | ms/batch 61.49 | loss 3.37 | ppl 28.98\n", + "| epoch 47 | 2800/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.45 | ppl 31.34\n", + "| epoch 47 | 3000/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.35 | ppl 28.50\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 47 | time: 206.90s | valid loss 5.96 | valid ppl 388.68\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 48 | 200/ 3181 batches | lr 0.45 | ms/batch 61.70 | loss 3.48 | ppl 32.61\n", + "| epoch 48 | 400/ 3181 batches | lr 0.45 | ms/batch 61.43 | loss 3.42 | ppl 30.51\n", + "| epoch 48 | 600/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.39 | ppl 29.65\n", + "| epoch 48 | 800/ 3181 batches | lr 0.45 | ms/batch 61.36 | loss 3.46 | ppl 31.70\n", + "| epoch 48 | 1000/ 3181 batches | lr 0.45 | ms/batch 61.50 | loss 3.46 | ppl 31.66\n", + "| epoch 48 | 1200/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.42 | ppl 30.56\n", + "| epoch 48 | 1400/ 3181 batches | lr 0.45 | ms/batch 61.49 | loss 3.45 | ppl 31.65\n", + "| epoch 48 | 1600/ 3181 batches | lr 0.45 | ms/batch 61.43 | loss 3.42 | ppl 30.66\n", + "| epoch 48 | 1800/ 3181 batches | lr 0.45 | ms/batch 61.41 | loss 3.43 | ppl 30.74\n", + "| epoch 48 | 2000/ 3181 batches | lr 0.45 | ms/batch 61.40 | loss 3.42 | ppl 30.48\n", + "| epoch 48 | 2200/ 3181 batches | lr 0.45 | ms/batch 61.47 | loss 3.41 | ppl 30.33\n", + "| epoch 48 | 2400/ 3181 batches | lr 0.45 | ms/batch 61.38 | loss 3.41 | ppl 30.38\n", + "| epoch 48 | 2600/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.36 | ppl 28.89\n", + "| epoch 48 | 2800/ 3181 batches | lr 0.45 | ms/batch 61.42 | loss 3.43 | ppl 30.96\n", + "| epoch 48 | 3000/ 3181 batches | lr 0.45 | ms/batch 61.39 | loss 3.34 | ppl 28.33\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 48 | time: 206.81s | valid loss 5.95 | valid ppl 383.84\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 49 | 200/ 3181 batches | lr 0.43 | ms/batch 61.73 | loss 3.47 | ppl 32.23\n", + "| epoch 49 | 400/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.41 | ppl 30.33\n", + "| epoch 49 | 600/ 3181 batches | lr 0.43 | ms/batch 61.43 | loss 3.38 | ppl 29.52\n", + "| epoch 49 | 800/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.45 | ppl 31.40\n", + "| epoch 49 | 1000/ 3181 batches | lr 0.43 | ms/batch 61.47 | loss 3.46 | ppl 31.72\n", + "| epoch 49 | 1200/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.41 | ppl 30.25\n", + "| epoch 49 | 1400/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.45 | ppl 31.41\n", + "| epoch 49 | 1600/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.42 | ppl 30.56\n", + "| epoch 49 | 1800/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.43 | ppl 30.75\n", + "| epoch 49 | 2000/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.41 | ppl 30.28\n", + "| epoch 49 | 2200/ 3181 batches | lr 0.43 | ms/batch 61.45 | loss 3.41 | ppl 30.14\n", + "| epoch 49 | 2400/ 3181 batches | lr 0.43 | ms/batch 61.47 | loss 3.41 | ppl 30.22\n", + "| epoch 49 | 2600/ 3181 batches | lr 0.43 | ms/batch 61.44 | loss 3.35 | ppl 28.48\n", + "| epoch 49 | 2800/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.43 | ppl 30.72\n", + "| epoch 49 | 3000/ 3181 batches | lr 0.43 | ms/batch 61.37 | loss 3.34 | ppl 28.18\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 49 | time: 206.79s | valid loss 5.98 | valid ppl 395.98\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 50 | 200/ 3181 batches | lr 0.40 | ms/batch 61.71 | loss 3.47 | ppl 32.06\n", + "| epoch 50 | 400/ 3181 batches | lr 0.40 | ms/batch 61.39 | loss 3.41 | ppl 30.15\n", + "| epoch 50 | 600/ 3181 batches | lr 0.40 | ms/batch 61.37 | loss 3.38 | ppl 29.27\n", + "| epoch 50 | 800/ 3181 batches | lr 0.40 | ms/batch 61.42 | loss 3.43 | ppl 31.02\n", + "| epoch 50 | 1000/ 3181 batches | lr 0.40 | ms/batch 61.34 | loss 3.44 | ppl 31.16\n", + "| epoch 50 | 1200/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.40 | ppl 29.97\n", + "| epoch 50 | 1400/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.44 | ppl 31.23\n", + "| epoch 50 | 1600/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.41 | ppl 30.24\n", + "| epoch 50 | 1800/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.42 | ppl 30.64\n", + "| epoch 50 | 2000/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.40 | ppl 30.07\n", + "| epoch 50 | 2200/ 3181 batches | lr 0.40 | ms/batch 61.49 | loss 3.39 | ppl 29.78\n", + "| epoch 50 | 2400/ 3181 batches | lr 0.40 | ms/batch 61.41 | loss 3.40 | ppl 29.98\n", + "| epoch 50 | 2600/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.35 | ppl 28.40\n", + "| epoch 50 | 2800/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.43 | ppl 30.72\n", + "| epoch 50 | 3000/ 3181 batches | lr 0.40 | ms/batch 61.44 | loss 3.34 | ppl 28.08\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 50 | time: 206.77s | valid loss 6.01 | valid ppl 407.69\n", "-----------------------------------------------------------------------------------------\n" ] } @@ -1571,7 +1683,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 57, "id": "12fdd0aa", "metadata": { "scrolled": true @@ -1582,7 +1694,7 @@ "output_type": "stream", "text": [ "=========================================================================================\n", - "| End of training | test loss 5.23 | test ppl 186.92\n", + "| End of training | test loss 5.36 | test ppl 213.09\n", "=========================================================================================\n" ] } @@ -1598,7 +1710,7 @@ }, { "cell_type": "markdown", - "id": "3abfa421", + "id": "528c9f10", "metadata": {}, "source": [ "### save trained model to file" @@ -1606,8 +1718,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "id": "a747e692", + "execution_count": 58, + "id": "848af399", "metadata": {}, "outputs": [], "source": [ @@ -1632,7 +1744,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 59, "id": "cfb30fe0", "metadata": {}, "outputs": [], @@ -1654,7 +1766,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 60, "id": "305853e8", "metadata": {}, "outputs": [], @@ -1673,7 +1785,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 61, "id": "afe585d6", "metadata": {}, "outputs": [], @@ -1693,7 +1805,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 62, "id": "8bfaa8bd", "metadata": {}, "outputs": [], @@ -1713,7 +1825,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 63, "id": "6e2c35ba", "metadata": {}, "outputs": [ @@ -1723,7 +1835,7 @@ "device(type='cuda')" ] }, - "execution_count": 50, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1733,6 +1845,35 @@ "device" ] }, + { + "cell_type": "markdown", + "id": "bef90722", + "metadata": {}, + "source": [ + "### optionally load model from file if it was trained already" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "223eed8a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_model.load_state_dict(torch.load(\"autocomplete_model\"))" + ] + }, { "cell_type": "markdown", "id": "dd71bdfc", @@ -1743,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 76, "id": "64223e87", "metadata": {}, "outputs": [], @@ -1755,15 +1896,13 @@ "\n", " # Get top N categories\n", " topv, topi = output.topk(n_predictions, 1, True)\n", - " #x, y = output.topk(n_predictions, 1, True)\n", - " #print(x.shape)\n", - " #print(topv.shape)\n", - " # print(topi.shape)\n", + "\n", " predictions = []\n", " for i in range(n_predictions):\n", " value = topv[0][i]\n", " v1, v2 = value.topk(1)\n", " predict_token_index = v2.cpu().detach().numpy()\n", + " print(\"predict token index: \", predict_token_index)\n", " predictions.append(vocab.lookup_token(predict_token_index))\n", " return predictions" ] @@ -1778,15 +1917,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "c83e3b75", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 52, + "execution_count": 77, "id": "b2895698", "metadata": {}, "outputs": [], @@ -1796,15 +1927,14 @@ " is_terminated = False\n", " input_batch = sample_batch\n", " while(not is_terminated):\n", - " # I guess 2*count is need because spaces get counted aswell\n", - " mask_size = bptt+(iteration)\n", - " print(mask_size)\n", + " # 2*count is need because spaces count aswell\n", + " mask_size = bptt+(iteration) \n", " src_mask = generate_square_subsequent_mask(mask_size).to(device)\n", " data = toDataTensor()\n", " for i, d in enumerate(data):\n", " predictions = predict(d, src_mask, num_of_pred)\n", - " print(\"Current input:\", count)\n", - " print(input_batch[count])\n", + " print(\"Current input:\", i)\n", + " print(input_batch[i])\n", " print(\"Possible continuations:\")\n", " for j in range(len(predictions)):\n", " print(j + 1, \": \", predictions[j])\n", @@ -1823,7 +1953,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": null, "id": "13ed9298", "metadata": {}, "outputs": [ @@ -1831,21 +1961,115 @@ "name": "stdout", "output_type": "stream", "text": [ - "3\n", "\n", - "> tensor([ 3, 542, 17])\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'count' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [54], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpredict_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn [52], line 13\u001b[0m, in \u001b[0;36mpredict_loop\u001b[0;34m(num_of_pred)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, d \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(data):\n\u001b[1;32m 12\u001b[0m predictions \u001b[38;5;241m=\u001b[39m predict(d, src_mask, num_of_pred)\n\u001b[0;32m---> 13\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCurrent input:\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[43mcount\u001b[49m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(input_batch[count])\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPossible continuations:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'count' is not defined" + "> tensor([ 3, 161, 18])\n", + "predict token index: [2]\n", + "predict token index: [5]\n", + "predict token index: [3]\n", + "Current input: 0\n", + "The brain is\n", + "Possible continuations:\n", + "1 : ,\n", + "2 : of\n", + "3 : the\n", + "Choose continuation by index:3\n", + "Text is now:\n", + "The brain is the\n", + "\n", + "> tensor([ 3, 374, 18])\n", + "predict token index: [2]\n", + "predict token index: [5]\n", + "predict token index: [183]\n", + "Current input: 1\n", + "The lung is\n", + "Possible continuations:\n", + "1 : ,\n", + "2 : of\n", + "3 : identified\n", + "Choose continuation by index:3\n", + "Text is now:\n", + "The lung is identified\n", + "\n", + "> tensor([ 3, 161, 18, 3])\n", + "predict token index: [2]\n", + "predict token index: [5]\n", + "predict token index: [132]\n", + "Current input: 0\n", + "The brain is the\n", + "Possible continuations:\n", + "1 : ,\n", + "2 : of\n", + "3 : most\n", + "Choose continuation by index:3\n", + "Text is now:\n", + "The brain is the most\n", + "\n", + "> tensor([ 3, 374, 18, 183])\n", + "predict token index: [2]\n", + "predict token index: [5]\n", + "predict token index: [8]\n", + "Current input: 1\n", + "The lung is identified\n", + "Possible continuations:\n", + "1 : ,\n", + "2 : of\n", + "3 : in\n", + "Choose continuation by index:1\n", + "Text is now:\n", + "The lung is identified ,\n", + "\n", + "> tensor([ 3, 161, 18, 3, 132])\n", + "predict token index: [258]\n", + "predict token index: [5]\n", + "predict token index: [5]\n", + "Current input: 0\n", + "The brain is the most\n", + "Possible continuations:\n", + "1 : common\n", + "2 : of\n", + "3 : of\n", + "Choose continuation by index:1\n", + "Text is now:\n", + "The brain is the most common\n", + "\n", + "> tensor([ 3, 374, 18, 183, 2])\n", + "predict token index: [4]\n", + "predict token index: [4]\n", + "predict token index: [3]\n", + "Current input: 1\n", + "The lung is identified ,\n", + "Possible continuations:\n", + "1 : and\n", + "2 : and\n", + "3 : the\n", + "Choose continuation by index:3\n", + "Text is now:\n", + "The lung is identified , the\n", + "\n", + "> tensor([ 3, 161, 18, 3, 132, 258])\n", + "predict token index: [258]\n", + "predict token index: [1]\n", + "predict token index: [5]\n", + "Current input: 0\n", + "The brain is the most common\n", + "Possible continuations:\n", + "1 : common\n", + "2 : .\n", + "3 : of\n", + "Choose continuation by index:3\n", + "Text is now:\n", + "The brain is the most common of\n", + "\n", + "> tensor([ 3, 374, 18, 183, 2, 3])\n", + "predict token index: [4]\n", + "predict token index: [4]\n", + "predict token index: [3]\n", + "Current input: 1\n", + "The lung is identified , the\n", + "Possible continuations:\n", + "1 : and\n", + "2 : and\n", + "3 : the\n" ] } ],