From 7e86e7f6d12c226941e295c791930d11e41398b7 Mon Sep 17 00:00:00 2001 From: Leonard Starke Date: Wed, 18 Jan 2023 21:47:04 +0100 Subject: [PATCH] merge with local progress from leo --- AutomaticSentenceCompletion.ipynb | 1988 +++++++++++++---------------- 1 file changed, 861 insertions(+), 1127 deletions(-) diff --git a/AutomaticSentenceCompletion.ipynb b/AutomaticSentenceCompletion.ipynb index ca9a2eb..ae1e53c 100644 --- a/AutomaticSentenceCompletion.ipynb +++ b/AutomaticSentenceCompletion.ipynb @@ -14,7 +14,8 @@ "metadata": {}, "source": [ "### Authors:\n", - "- Constantin Fürst\n", + "- Eric Münzberg\n", + "- Shahein Enjjar\t\n", "- Leonard Starke" ] }, @@ -43,14 +44,14 @@ { "cell_type": "code", "execution_count": 2, - "id": "293027a6", + "id": "e1912a79", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2023-01-18 14:27:45-- https://cloud.constantin-fuerst.com/s/944x5BpTQM7GjtF/download\n", + "--2023-01-18 15:48:24-- https://cloud.constantin-fuerst.com/s/944x5BpTQM7GjtF/download\n", "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", "Resolving cloud.constantin-fuerst.com (cloud.constantin-fuerst.com)... 95.91.21.14\n", "Connecting to cloud.constantin-fuerst.com (cloud.constantin-fuerst.com)|95.91.21.14|:443... connected.\n", @@ -58,9 +59,9 @@ "Length: 1100551 (1.0M) [text/plain]\n", "Saving to: ‘pubmed-query.txt’\n", "\n", - "pubmed-query.txt 100%[===================>] 1.05M 1.91MB/s in 0.6s \n", + "pubmed-query.txt 100%[===================>] 1.05M 1.61MB/s in 0.7s \n", "\n", - "2023-01-18 14:27:47 (1.91 MB/s) - ‘pubmed-query.txt’ saved [1100551/1100551]\n", + "2023-01-18 15:48:25 (1.61 MB/s) - ‘pubmed-query.txt’ saved [1100551/1100551]\n", "\n" ] } @@ -79,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "adfb256a", "metadata": {}, "outputs": [], @@ -100,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "id": "00481ec9", "metadata": {}, "outputs": [ @@ -108,12 +109,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Got 290 records from the query text file\n" + "Got 150000 records from the query text file\n" ] } ], "source": [ + "max_records = 150000\n", "records = getPapers(\"pubmed-query.txt\")\n", + "records = records[:min(max_records,len(records))]\n", "print(f\"Got {len(records)} records from the query text file\")" ] }, @@ -127,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "id": "dcf5c217", "metadata": {}, "outputs": [], @@ -148,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "id": "c3199444", "metadata": {}, "outputs": [ @@ -184,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "id": "daca9db6", "metadata": {}, "outputs": [], @@ -206,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "8d2312db", "metadata": {}, "outputs": [], @@ -224,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "id": "3f23404d", "metadata": {}, "outputs": [], @@ -236,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "id": "8a128d3c", "metadata": {}, "outputs": [], @@ -248,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 11, "id": "97e89986", "metadata": {}, "outputs": [], @@ -260,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "id": "0d6e89c4", "metadata": {}, "outputs": [], @@ -280,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "id": "0bdbc40a", "metadata": {}, "outputs": [], @@ -301,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "id": "a438ab1f", "metadata": {}, "outputs": [], @@ -322,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "id": "0e5bc361", "metadata": {}, "outputs": [], @@ -335,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "id": "dfd7400d", "metadata": {}, "outputs": [], @@ -358,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "c155ee31", "metadata": {}, "outputs": [], @@ -368,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 21, "id": "79b2d248", "metadata": {}, "outputs": [ @@ -378,7 +381,7 @@ "device(type='cuda')" ] }, - "execution_count": 43, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +400,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "id": "a33d722f", "metadata": {}, "outputs": [], @@ -461,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "id": "c2f6d33b", "metadata": {}, "outputs": [], @@ -498,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "id": "9e184841", "metadata": {}, "outputs": [], @@ -522,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "a4def1ac", "metadata": {}, "outputs": [], @@ -544,7 +547,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "id": "4ab5b8fd", "metadata": {}, "outputs": [], @@ -576,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "id": "c53764da", "metadata": {}, "outputs": [], @@ -600,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "id": "ddaa1d64", "metadata": {}, "outputs": [], @@ -624,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "id": "50ab3fb6", "metadata": {}, "outputs": [], @@ -673,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "id": "3d179bb0", "metadata": {}, "outputs": [], @@ -704,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 31, "id": "09c4d4ce", "metadata": { "scrolled": true @@ -714,966 +717,754 @@ "name": "stdout", "output_type": "stream", "text": [ - "| epoch 1 | 200/ 3181 batches | lr 5.00 | ms/batch 101.32 | loss 9.07 | ppl 8713.01\n", - "| epoch 1 | 400/ 3181 batches | lr 5.00 | ms/batch 60.68 | loss 7.32 | ppl 1516.45\n", - "| epoch 1 | 600/ 3181 batches | lr 5.00 | ms/batch 60.86 | loss 6.78 | ppl 878.02\n", - "| epoch 1 | 800/ 3181 batches | lr 5.00 | ms/batch 60.93 | loss 6.44 | ppl 628.78\n", - "| epoch 1 | 1000/ 3181 batches | lr 5.00 | ms/batch 60.95 | loss 6.31 | ppl 551.05\n", - "| epoch 1 | 1200/ 3181 batches | lr 5.00 | ms/batch 60.99 | loss 6.19 | ppl 486.01\n", - "| epoch 1 | 1400/ 3181 batches | lr 5.00 | ms/batch 61.07 | loss 6.09 | ppl 441.65\n", - "| epoch 1 | 1600/ 3181 batches | lr 5.00 | ms/batch 61.08 | loss 6.07 | ppl 431.75\n", - "| epoch 1 | 1800/ 3181 batches | lr 5.00 | ms/batch 61.06 | loss 6.00 | ppl 403.14\n", - "| epoch 1 | 2000/ 3181 batches | lr 5.00 | ms/batch 61.16 | loss 5.91 | ppl 367.09\n", - "| epoch 1 | 2200/ 3181 batches | lr 5.00 | ms/batch 61.25 | loss 5.89 | ppl 359.65\n", - "| epoch 1 | 2400/ 3181 batches | lr 5.00 | ms/batch 61.37 | loss 5.86 | ppl 349.26\n", - "| epoch 1 | 2600/ 3181 batches | lr 5.00 | ms/batch 61.25 | loss 5.77 | ppl 319.99\n", - "| epoch 1 | 2800/ 3181 batches | lr 5.00 | ms/batch 61.29 | loss 5.80 | ppl 330.72\n", - "| epoch 1 | 3000/ 3181 batches | lr 5.00 | ms/batch 61.37 | loss 5.71 | ppl 303.17\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 1 | time: 213.85s | valid loss 5.73 | valid ppl 307.24\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 2 | 200/ 3181 batches | lr 4.75 | ms/batch 61.64 | loss 5.71 | ppl 302.70\n", - "| epoch 2 | 400/ 3181 batches | lr 4.75 | ms/batch 61.23 | loss 5.63 | ppl 279.76\n", - "| epoch 2 | 600/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.63 | ppl 277.70\n", - "| epoch 2 | 800/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.56 | ppl 260.48\n", - "| epoch 2 | 1000/ 3181 batches | lr 4.75 | ms/batch 61.24 | loss 5.58 | ppl 266.00\n", - "| epoch 2 | 1200/ 3181 batches | lr 4.75 | ms/batch 61.37 | loss 5.55 | ppl 257.88\n", - "| epoch 2 | 1400/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.53 | ppl 251.27\n", - "| epoch 2 | 1600/ 3181 batches | lr 4.75 | ms/batch 61.30 | loss 5.55 | ppl 255.98\n", - "| epoch 2 | 1800/ 3181 batches | lr 4.75 | ms/batch 61.29 | loss 5.53 | ppl 253.01\n", - "| epoch 2 | 2000/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.48 | ppl 238.90\n", - "| epoch 2 | 2200/ 3181 batches | lr 4.75 | ms/batch 61.35 | loss 5.46 | ppl 235.16\n", - "| epoch 2 | 2400/ 3181 batches | lr 4.75 | ms/batch 61.33 | loss 5.46 | ppl 235.11\n", - "| epoch 2 | 2600/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.40 | ppl 221.12\n", - "| epoch 2 | 2800/ 3181 batches | lr 4.75 | ms/batch 61.34 | loss 5.46 | ppl 234.30\n", - "| epoch 2 | 3000/ 3181 batches | lr 4.75 | ms/batch 61.28 | loss 5.37 | ppl 214.39\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 2 | time: 206.48s | valid loss 5.53 | valid ppl 252.22\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 3 | 200/ 3181 batches | lr 4.51 | ms/batch 61.62 | loss 5.42 | ppl 226.39\n", - "| epoch 3 | 400/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.36 | ppl 212.24\n", - "| epoch 3 | 600/ 3181 batches | lr 4.51 | ms/batch 61.31 | loss 5.34 | ppl 209.08\n", - "| epoch 3 | 800/ 3181 batches | lr 4.51 | ms/batch 61.32 | loss 5.31 | ppl 201.91\n", - "| epoch 3 | 1000/ 3181 batches | lr 4.51 | ms/batch 61.29 | loss 5.33 | ppl 207.08\n", - "| epoch 3 | 1200/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.30 | ppl 200.84\n", - "| epoch 3 | 1400/ 3181 batches | lr 4.51 | ms/batch 61.32 | loss 5.29 | ppl 198.48\n", - "| epoch 3 | 1600/ 3181 batches | lr 4.51 | ms/batch 61.30 | loss 5.31 | ppl 202.12\n", - "| epoch 3 | 1800/ 3181 batches | lr 4.51 | ms/batch 61.35 | loss 5.30 | ppl 200.79\n", - "| epoch 3 | 2000/ 3181 batches | lr 4.51 | ms/batch 61.33 | loss 5.26 | ppl 191.59\n", - "| epoch 3 | 2200/ 3181 batches | lr 4.51 | ms/batch 61.34 | loss 5.25 | ppl 190.89\n", - "| epoch 3 | 2400/ 3181 batches | lr 4.51 | ms/batch 61.39 | loss 5.25 | ppl 190.57\n", - "| epoch 3 | 2600/ 3181 batches | lr 4.51 | ms/batch 61.42 | loss 5.19 | ppl 180.17\n", - "| epoch 3 | 2800/ 3181 batches | lr 4.51 | ms/batch 61.38 | loss 5.26 | ppl 191.72\n", - "| epoch 3 | 3000/ 3181 batches | lr 4.51 | ms/batch 61.34 | loss 5.18 | ppl 177.08\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 3 | time: 206.57s | valid loss 5.44 | valid ppl 231.07\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 4 | 200/ 3181 batches | lr 4.29 | ms/batch 61.64 | loss 5.25 | ppl 190.59\n", - "| epoch 4 | 400/ 3181 batches | lr 4.29 | ms/batch 61.50 | loss 5.19 | ppl 178.85\n", - "| epoch 4 | 600/ 3181 batches | lr 4.29 | ms/batch 61.26 | loss 5.17 | ppl 176.66\n", - "| epoch 4 | 800/ 3181 batches | lr 4.29 | ms/batch 61.36 | loss 5.15 | ppl 172.78\n", - "| epoch 4 | 1000/ 3181 batches | lr 4.29 | ms/batch 61.32 | loss 5.19 | ppl 179.49\n", - "| epoch 4 | 1200/ 3181 batches | lr 4.29 | ms/batch 61.38 | loss 5.15 | ppl 172.65\n", - "| epoch 4 | 1400/ 3181 batches | lr 4.29 | ms/batch 61.39 | loss 5.14 | ppl 170.97\n", - "| epoch 4 | 1600/ 3181 batches | lr 4.29 | ms/batch 61.42 | loss 5.16 | ppl 174.44\n", - "| epoch 4 | 1800/ 3181 batches | lr 4.29 | ms/batch 61.40 | loss 5.16 | ppl 174.19\n", - "| epoch 4 | 2000/ 3181 batches | lr 4.29 | ms/batch 61.38 | loss 5.11 | ppl 166.50\n", - "| epoch 4 | 2200/ 3181 batches | lr 4.29 | ms/batch 61.37 | loss 5.10 | ppl 164.42\n", - "| epoch 4 | 2400/ 3181 batches | lr 4.29 | ms/batch 61.33 | loss 5.11 | ppl 165.60\n", - "| epoch 4 | 2600/ 3181 batches | lr 4.29 | ms/batch 61.41 | loss 5.06 | ppl 157.61\n", - "| epoch 4 | 2800/ 3181 batches | lr 4.29 | ms/batch 61.29 | loss 5.12 | ppl 167.69\n", - "| epoch 4 | 3000/ 3181 batches | lr 4.29 | ms/batch 61.37 | loss 5.05 | ppl 155.52\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 4 | time: 206.63s | valid loss 5.39 | valid ppl 218.67\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 5 | 200/ 3181 batches | lr 4.07 | ms/batch 61.64 | loss 5.13 | ppl 168.25\n", - "| epoch 5 | 400/ 3181 batches | lr 4.07 | ms/batch 61.30 | loss 5.06 | ppl 156.82\n", - "| epoch 5 | 600/ 3181 batches | lr 4.07 | ms/batch 61.38 | loss 5.04 | ppl 155.08\n", - "| epoch 5 | 800/ 3181 batches | lr 4.07 | ms/batch 61.33 | loss 5.03 | ppl 152.77\n", - "| epoch 5 | 1000/ 3181 batches | lr 4.07 | ms/batch 61.37 | loss 5.05 | ppl 156.69\n", - "| epoch 5 | 1200/ 3181 batches | lr 4.07 | ms/batch 61.32 | loss 5.02 | ppl 151.80\n", - "| epoch 5 | 1400/ 3181 batches | lr 4.07 | ms/batch 61.36 | loss 5.02 | ppl 151.68\n", - "| epoch 5 | 1600/ 3181 batches | lr 4.07 | ms/batch 61.37 | loss 5.03 | ppl 152.67\n", - "| epoch 5 | 1800/ 3181 batches | lr 4.07 | ms/batch 61.39 | loss 5.03 | ppl 152.77\n", - "| epoch 5 | 2000/ 3181 batches | lr 4.07 | ms/batch 61.35 | loss 4.99 | ppl 147.43\n", - "| epoch 5 | 2200/ 3181 batches | lr 4.07 | ms/batch 61.23 | loss 4.98 | ppl 145.22\n", - "| epoch 5 | 2400/ 3181 batches | lr 4.07 | ms/batch 61.33 | loss 4.99 | ppl 146.65\n", - "| epoch 5 | 2600/ 3181 batches | lr 4.07 | ms/batch 61.36 | loss 4.94 | ppl 140.06\n", - "| epoch 5 | 2800/ 3181 batches | lr 4.07 | ms/batch 61.35 | loss 5.01 | ppl 149.53\n", - "| epoch 5 | 3000/ 3181 batches | lr 4.07 | ms/batch 61.32 | loss 4.92 | ppl 136.69\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 5 | time: 206.57s | valid loss 5.41 | valid ppl 223.14\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 6 | 200/ 3181 batches | lr 3.87 | ms/batch 61.65 | loss 5.01 | ppl 149.89\n", - "| epoch 6 | 400/ 3181 batches | lr 3.87 | ms/batch 61.33 | loss 4.94 | ppl 140.08\n", - "| epoch 6 | 600/ 3181 batches | lr 3.87 | ms/batch 61.43 | loss 4.93 | ppl 137.75\n" + "| epoch 1 | 200/13484 batches | lr 5.00 | ms/batch 116.24 | loss 9.27 | ppl 10651.55\n", + "| epoch 1 | 400/13484 batches | lr 5.00 | ms/batch 114.02 | loss 7.49 | ppl 1787.62\n", + "| epoch 1 | 600/13484 batches | lr 5.00 | ms/batch 114.33 | loss 6.83 | ppl 923.44\n", + "| epoch 1 | 800/13484 batches | lr 5.00 | ms/batch 114.54 | loss 6.54 | ppl 693.98\n", + "| epoch 1 | 1000/13484 batches | lr 5.00 | ms/batch 114.73 | loss 6.33 | ppl 563.29\n", + "| epoch 1 | 1200/13484 batches | lr 5.00 | ms/batch 114.85 | loss 6.18 | ppl 485.05\n", + "| epoch 1 | 1400/13484 batches | lr 5.00 | ms/batch 114.91 | loss 6.09 | ppl 440.69\n", + "| epoch 1 | 1600/13484 batches | lr 5.00 | ms/batch 115.00 | loss 6.06 | ppl 428.38\n", + "| epoch 1 | 1800/13484 batches | lr 5.00 | ms/batch 115.17 | loss 5.98 | ppl 397.07\n", + "| epoch 1 | 2000/13484 batches | lr 5.00 | ms/batch 115.17 | loss 5.91 | ppl 369.13\n", + "| epoch 1 | 2200/13484 batches | lr 5.00 | ms/batch 115.22 | loss 5.89 | ppl 360.14\n", + "| epoch 1 | 2400/13484 batches | lr 5.00 | ms/batch 115.20 | loss 5.83 | ppl 341.10\n", + "| epoch 1 | 2600/13484 batches | lr 5.00 | ms/batch 115.23 | loss 5.78 | ppl 322.33\n", + "| epoch 1 | 2800/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.80 | ppl 329.27\n", + "| epoch 1 | 3000/13484 batches | lr 5.00 | ms/batch 115.12 | loss 5.77 | ppl 321.64\n", + "| epoch 1 | 3200/13484 batches | lr 5.00 | ms/batch 115.23 | loss 5.71 | ppl 303.37\n", + "| epoch 1 | 3400/13484 batches | lr 5.00 | ms/batch 115.32 | loss 5.74 | ppl 311.04\n", + "| epoch 1 | 3600/13484 batches | lr 5.00 | ms/batch 115.15 | loss 5.70 | ppl 299.44\n", + "| epoch 1 | 3800/13484 batches | lr 5.00 | ms/batch 115.20 | loss 5.68 | ppl 292.67\n", + "| epoch 1 | 4000/13484 batches | lr 5.00 | ms/batch 115.32 | loss 5.59 | ppl 268.70\n", + "| epoch 1 | 4200/13484 batches | lr 5.00 | ms/batch 115.19 | loss 5.62 | ppl 275.23\n", + "| epoch 1 | 4400/13484 batches | lr 5.00 | ms/batch 115.32 | loss 5.63 | ppl 277.51\n", + "| epoch 1 | 4600/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.66 | ppl 286.99\n", + "| epoch 1 | 4800/13484 batches | lr 5.00 | ms/batch 115.30 | loss 5.62 | ppl 276.08\n", + "| epoch 1 | 5000/13484 batches | lr 5.00 | ms/batch 115.15 | loss 5.61 | ppl 272.68\n", + "| epoch 1 | 5200/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.59 | ppl 268.83\n", + "| epoch 1 | 5400/13484 batches | lr 5.00 | ms/batch 115.29 | loss 5.55 | ppl 257.80\n", + "| epoch 1 | 5600/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.57 | ppl 261.32\n", + "| epoch 1 | 5800/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.55 | ppl 257.06\n", + "| epoch 1 | 6000/13484 batches | lr 5.00 | ms/batch 115.26 | loss 5.56 | ppl 259.08\n", + "| epoch 1 | 6200/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.57 | ppl 262.89\n", + "| epoch 1 | 6400/13484 batches | lr 5.00 | ms/batch 115.23 | loss 5.54 | ppl 254.66\n", + "| epoch 1 | 6600/13484 batches | lr 5.00 | ms/batch 115.27 | loss 5.57 | ppl 263.01\n", + "| epoch 1 | 6800/13484 batches | lr 5.00 | ms/batch 115.21 | loss 5.51 | ppl 246.13\n", + "| epoch 1 | 7000/13484 batches | lr 5.00 | ms/batch 115.32 | loss 5.57 | ppl 261.50\n", + "| epoch 1 | 7200/13484 batches | lr 5.00 | ms/batch 115.23 | loss 5.51 | ppl 247.48\n", + "| epoch 1 | 7400/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.50 | ppl 245.45\n", + "| epoch 1 | 7600/13484 batches | lr 5.00 | ms/batch 115.26 | loss 5.51 | ppl 247.79\n", + "| epoch 1 | 7800/13484 batches | lr 5.00 | ms/batch 115.27 | loss 5.50 | ppl 245.74\n", + "| epoch 1 | 8000/13484 batches | lr 5.00 | ms/batch 115.33 | loss 5.48 | ppl 240.49\n", + "| epoch 1 | 8200/13484 batches | lr 5.00 | ms/batch 115.23 | loss 5.48 | ppl 238.87\n", + "| epoch 1 | 8400/13484 batches | lr 5.00 | ms/batch 115.27 | loss 5.49 | ppl 241.45\n", + "| epoch 1 | 8600/13484 batches | lr 5.00 | ms/batch 115.23 | loss 5.47 | ppl 236.88\n", + "| epoch 1 | 8800/13484 batches | lr 5.00 | ms/batch 115.28 | loss 5.47 | ppl 236.31\n", + "| epoch 1 | 9000/13484 batches | lr 5.00 | ms/batch 115.20 | loss 5.48 | ppl 240.63\n", + "| epoch 1 | 9200/13484 batches | lr 5.00 | ms/batch 115.22 | loss 5.48 | ppl 239.53\n", + "| epoch 1 | 9400/13484 batches | lr 5.00 | ms/batch 115.29 | loss 5.48 | ppl 238.75\n", + "| epoch 1 | 9600/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.43 | ppl 229.14\n", + "| epoch 1 | 9800/13484 batches | lr 5.00 | ms/batch 115.22 | loss 5.42 | ppl 226.49\n", + "| epoch 1 | 10000/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.47 | ppl 236.79\n", + "| epoch 1 | 10200/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.41 | ppl 223.98\n", + "| epoch 1 | 10400/13484 batches | lr 5.00 | ms/batch 115.16 | loss 5.39 | ppl 219.63\n", + "| epoch 1 | 10600/13484 batches | lr 5.00 | ms/batch 115.22 | loss 5.42 | ppl 225.37\n", + "| epoch 1 | 10800/13484 batches | lr 5.00 | ms/batch 115.30 | loss 5.45 | ppl 232.44\n", + "| epoch 1 | 11000/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.45 | ppl 232.12\n", + "| epoch 1 | 11200/13484 batches | lr 5.00 | ms/batch 115.21 | loss 5.43 | ppl 228.71\n", + "| epoch 1 | 11400/13484 batches | lr 5.00 | ms/batch 115.32 | loss 5.38 | ppl 216.73\n", + "| epoch 1 | 11600/13484 batches | lr 5.00 | ms/batch 115.17 | loss 5.41 | ppl 222.68\n", + "| epoch 1 | 11800/13484 batches | lr 5.00 | ms/batch 115.28 | loss 5.39 | ppl 218.39\n", + "| epoch 1 | 12000/13484 batches | lr 5.00 | ms/batch 115.17 | loss 5.44 | ppl 229.94\n", + "| epoch 1 | 12200/13484 batches | lr 5.00 | ms/batch 115.20 | loss 5.36 | ppl 213.26\n", + "| epoch 1 | 12400/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.38 | ppl 217.41\n", + "| epoch 1 | 12600/13484 batches | lr 5.00 | ms/batch 115.24 | loss 5.40 | ppl 222.35\n", + "| epoch 1 | 12800/13484 batches | lr 5.00 | ms/batch 115.22 | loss 5.41 | ppl 224.63\n", + "| epoch 1 | 13000/13484 batches | lr 5.00 | ms/batch 115.29 | loss 5.40 | ppl 220.79\n", + "| epoch 1 | 13200/13484 batches | lr 5.00 | ms/batch 115.16 | loss 5.41 | ppl 223.58\n", + "| epoch 1 | 13400/13484 batches | lr 5.00 | ms/batch 115.25 | loss 5.42 | ppl 225.49\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 1 | time: 1625.43s | valid loss 5.35 | valid ppl 210.93\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 2 | 200/13484 batches | lr 4.75 | ms/batch 115.84 | loss 5.44 | ppl 229.80\n", + "| epoch 2 | 400/13484 batches | lr 4.75 | ms/batch 115.20 | loss 5.38 | ppl 216.74\n", + "| epoch 2 | 600/13484 batches | lr 4.75 | ms/batch 115.21 | loss 5.35 | ppl 211.15\n", + "| epoch 2 | 800/13484 batches | lr 4.75 | ms/batch 115.20 | loss 5.37 | ppl 215.74\n", + "| epoch 2 | 1000/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.35 | ppl 210.96\n", + "| epoch 2 | 1200/13484 batches | lr 4.75 | ms/batch 115.17 | loss 5.33 | ppl 207.12\n", + "| epoch 2 | 1400/13484 batches | lr 4.75 | ms/batch 115.22 | loss 5.34 | ppl 208.70\n", + "| epoch 2 | 1600/13484 batches | lr 4.75 | ms/batch 115.14 | loss 5.36 | ppl 212.80\n", + "| epoch 2 | 1800/13484 batches | lr 4.75 | ms/batch 115.11 | loss 5.35 | ppl 209.96\n", + "| epoch 2 | 2000/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.32 | ppl 203.54\n", + "| epoch 2 | 2200/13484 batches | lr 4.75 | ms/batch 115.15 | loss 5.33 | ppl 205.82\n", + "| epoch 2 | 2400/13484 batches | lr 4.75 | ms/batch 115.23 | loss 5.34 | ppl 208.95\n", + "| epoch 2 | 2600/13484 batches | lr 4.75 | ms/batch 115.22 | loss 5.29 | ppl 199.16\n", + "| epoch 2 | 2800/13484 batches | lr 4.75 | ms/batch 115.16 | loss 5.34 | ppl 208.19\n", + "| epoch 2 | 3000/13484 batches | lr 4.75 | ms/batch 115.18 | loss 5.33 | ppl 205.88\n", + "| epoch 2 | 3200/13484 batches | lr 4.75 | ms/batch 115.26 | loss 5.29 | ppl 198.11\n", + "| epoch 2 | 3400/13484 batches | lr 4.75 | ms/batch 115.22 | loss 5.37 | ppl 214.29\n", + "| epoch 2 | 3600/13484 batches | lr 4.75 | ms/batch 115.26 | loss 5.31 | ppl 202.72\n", + "| epoch 2 | 3800/13484 batches | lr 4.75 | ms/batch 115.20 | loss 5.32 | ppl 203.84\n", + "| epoch 2 | 4000/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.24 | ppl 189.14\n", + "| epoch 2 | 4200/13484 batches | lr 4.75 | ms/batch 115.15 | loss 5.28 | ppl 196.95\n", + "| epoch 2 | 4400/13484 batches | lr 4.75 | ms/batch 115.17 | loss 5.29 | ppl 198.84\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 6 | 800/ 3181 batches | lr 3.87 | ms/batch 61.31 | loss 4.91 | ppl 135.89\n", - "| epoch 6 | 1000/ 3181 batches | lr 3.87 | ms/batch 61.29 | loss 4.95 | ppl 141.06\n", - "| epoch 6 | 1200/ 3181 batches | lr 3.87 | ms/batch 61.38 | loss 4.90 | ppl 134.49\n", - "| epoch 6 | 1400/ 3181 batches | lr 3.87 | ms/batch 61.33 | loss 4.91 | ppl 135.28\n", - "| epoch 6 | 1600/ 3181 batches | lr 3.87 | ms/batch 61.37 | loss 4.91 | ppl 136.26\n", - "| epoch 6 | 1800/ 3181 batches | lr 3.87 | ms/batch 61.36 | loss 4.93 | ppl 137.81\n", - "| epoch 6 | 2000/ 3181 batches | lr 3.87 | ms/batch 61.40 | loss 4.88 | ppl 131.80\n", - "| epoch 6 | 2200/ 3181 batches | lr 3.87 | ms/batch 61.43 | loss 4.87 | ppl 130.59\n", - "| epoch 6 | 2400/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.87 | ppl 130.95\n", - "| epoch 6 | 2600/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.83 | ppl 125.04\n", - "| epoch 6 | 2800/ 3181 batches | lr 3.87 | ms/batch 61.35 | loss 4.91 | ppl 135.49\n", - "| epoch 6 | 3000/ 3181 batches | lr 3.87 | ms/batch 61.29 | loss 4.81 | ppl 122.53\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 6 | time: 206.60s | valid loss 5.37 | valid ppl 214.67\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 7 | 200/ 3181 batches | lr 3.68 | ms/batch 61.66 | loss 4.91 | ppl 135.30\n", - "| epoch 7 | 400/ 3181 batches | lr 3.68 | ms/batch 61.23 | loss 4.83 | ppl 125.76\n", - "| epoch 7 | 600/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.83 | ppl 125.52\n", - "| epoch 7 | 800/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.82 | ppl 123.59\n", - "| epoch 7 | 1000/ 3181 batches | lr 3.68 | ms/batch 61.35 | loss 4.85 | ppl 127.65\n", - "| epoch 7 | 1200/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.81 | ppl 122.34\n", - "| epoch 7 | 1400/ 3181 batches | lr 3.68 | ms/batch 61.38 | loss 4.81 | ppl 123.22\n", - "| epoch 7 | 1600/ 3181 batches | lr 3.68 | ms/batch 61.34 | loss 4.82 | ppl 123.82\n", - "| epoch 7 | 1800/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.83 | ppl 125.22\n", - "| epoch 7 | 2000/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.79 | ppl 119.76\n", - "| epoch 7 | 2200/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.78 | ppl 118.99\n", - "| epoch 7 | 2400/ 3181 batches | lr 3.68 | ms/batch 61.33 | loss 4.78 | ppl 118.59\n", - "| epoch 7 | 2600/ 3181 batches | lr 3.68 | ms/batch 61.36 | loss 4.73 | ppl 113.60\n", - "| epoch 7 | 2800/ 3181 batches | lr 3.68 | ms/batch 61.33 | loss 4.80 | ppl 122.09\n", - "| epoch 7 | 3000/ 3181 batches | lr 3.68 | ms/batch 61.41 | loss 4.71 | ppl 111.19\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 7 | time: 206.61s | valid loss 5.35 | valid ppl 210.55\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 8 | 200/ 3181 batches | lr 3.49 | ms/batch 61.69 | loss 4.81 | ppl 122.20\n", - "| epoch 8 | 400/ 3181 batches | lr 3.49 | ms/batch 61.28 | loss 4.74 | ppl 114.11\n", - "| epoch 8 | 600/ 3181 batches | lr 3.49 | ms/batch 61.41 | loss 4.73 | ppl 113.82\n", - "| epoch 8 | 800/ 3181 batches | lr 3.49 | ms/batch 61.35 | loss 4.73 | ppl 113.04\n", - "| epoch 8 | 1000/ 3181 batches | lr 3.49 | ms/batch 61.46 | loss 4.75 | ppl 115.84\n", - "| epoch 8 | 1200/ 3181 batches | lr 3.49 | ms/batch 61.43 | loss 4.71 | ppl 111.58\n", - "| epoch 8 | 1400/ 3181 batches | lr 3.49 | ms/batch 61.37 | loss 4.72 | ppl 111.84\n", - "| epoch 8 | 1600/ 3181 batches | lr 3.49 | ms/batch 61.39 | loss 4.72 | ppl 112.52\n", - "| epoch 8 | 1800/ 3181 batches | lr 3.49 | ms/batch 61.44 | loss 4.74 | ppl 114.44\n", - "| epoch 8 | 2000/ 3181 batches | lr 3.49 | ms/batch 61.37 | loss 4.70 | ppl 109.63\n", - "| epoch 8 | 2200/ 3181 batches | lr 3.49 | ms/batch 61.31 | loss 4.68 | ppl 108.29\n", - "| epoch 8 | 2400/ 3181 batches | lr 3.49 | ms/batch 61.28 | loss 4.69 | ppl 108.78\n", - "| epoch 8 | 2600/ 3181 batches | lr 3.49 | ms/batch 61.30 | loss 4.64 | ppl 103.90\n", - "| epoch 8 | 2800/ 3181 batches | lr 3.49 | ms/batch 61.33 | loss 4.72 | ppl 111.83\n", - "| epoch 8 | 3000/ 3181 batches | lr 3.49 | ms/batch 61.33 | loss 4.62 | ppl 101.24\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 8 | time: 206.60s | valid loss 5.34 | valid ppl 208.08\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 9 | 200/ 3181 batches | lr 3.32 | ms/batch 61.64 | loss 4.72 | ppl 111.95\n", - "| epoch 9 | 400/ 3181 batches | lr 3.32 | ms/batch 61.40 | loss 4.65 | ppl 104.38\n", - "| epoch 9 | 600/ 3181 batches | lr 3.32 | ms/batch 61.33 | loss 4.64 | ppl 103.97\n", - "| epoch 9 | 800/ 3181 batches | lr 3.32 | ms/batch 61.32 | loss 4.64 | ppl 103.60\n", - "| epoch 9 | 1000/ 3181 batches | lr 3.32 | ms/batch 61.40 | loss 4.68 | ppl 107.40\n", - "| epoch 9 | 1200/ 3181 batches | lr 3.32 | ms/batch 61.39 | loss 4.62 | ppl 101.89\n", - "| epoch 9 | 1400/ 3181 batches | lr 3.32 | ms/batch 61.33 | loss 4.64 | ppl 103.60\n", - "| epoch 9 | 1600/ 3181 batches | lr 3.32 | ms/batch 61.30 | loss 4.64 | ppl 103.54\n", - "| epoch 9 | 1800/ 3181 batches | lr 3.32 | ms/batch 61.31 | loss 4.66 | ppl 105.35\n", - "| epoch 9 | 2000/ 3181 batches | lr 3.32 | ms/batch 61.36 | loss 4.62 | ppl 101.24\n", - "| epoch 9 | 2200/ 3181 batches | lr 3.32 | ms/batch 61.28 | loss 4.60 | ppl 99.91\n", - "| epoch 9 | 2400/ 3181 batches | lr 3.32 | ms/batch 61.34 | loss 4.61 | ppl 100.17\n", - "| epoch 9 | 2600/ 3181 batches | lr 3.32 | ms/batch 61.36 | loss 4.56 | ppl 95.58\n", - "| epoch 9 | 2800/ 3181 batches | lr 3.32 | ms/batch 61.43 | loss 4.63 | ppl 102.81\n", - "| epoch 9 | 3000/ 3181 batches | lr 3.32 | ms/batch 61.38 | loss 4.54 | ppl 93.66\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 9 | time: 206.58s | valid loss 5.35 | valid ppl 209.83\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 10 | 200/ 3181 batches | lr 3.15 | ms/batch 61.64 | loss 4.64 | ppl 103.90\n", - "| epoch 10 | 400/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.57 | ppl 96.88\n", - "| epoch 10 | 600/ 3181 batches | lr 3.15 | ms/batch 61.35 | loss 4.56 | ppl 95.86\n", - "| epoch 10 | 800/ 3181 batches | lr 3.15 | ms/batch 61.32 | loss 4.56 | ppl 95.84\n", - "| epoch 10 | 1000/ 3181 batches | lr 3.15 | ms/batch 61.33 | loss 4.59 | ppl 98.74\n", - "| epoch 10 | 1200/ 3181 batches | lr 3.15 | ms/batch 61.32 | loss 4.55 | ppl 94.35\n", - "| epoch 10 | 1400/ 3181 batches | lr 3.15 | ms/batch 61.27 | loss 4.56 | ppl 95.77\n", - "| epoch 10 | 1600/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.55 | ppl 94.76\n", - "| epoch 10 | 1800/ 3181 batches | lr 3.15 | ms/batch 61.37 | loss 4.57 | ppl 96.99\n", - "| epoch 10 | 2000/ 3181 batches | lr 3.15 | ms/batch 61.34 | loss 4.54 | ppl 93.41\n", - "| epoch 10 | 2200/ 3181 batches | lr 3.15 | ms/batch 61.29 | loss 4.53 | ppl 92.30\n", - "| epoch 10 | 2400/ 3181 batches | lr 3.15 | ms/batch 61.36 | loss 4.53 | ppl 92.36\n", - "| epoch 10 | 2600/ 3181 batches | lr 3.15 | ms/batch 61.34 | loss 4.48 | ppl 88.41\n", - "| epoch 10 | 2800/ 3181 batches | lr 3.15 | ms/batch 61.35 | loss 4.56 | ppl 95.78\n", - "| epoch 10 | 3000/ 3181 batches | lr 3.15 | ms/batch 61.33 | loss 4.46 | ppl 86.56\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 10 | time: 206.54s | valid loss 5.38 | valid ppl 216.73\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 11 | 200/ 3181 batches | lr 2.99 | ms/batch 61.65 | loss 4.57 | ppl 96.07\n", - "| epoch 11 | 400/ 3181 batches | lr 2.99 | ms/batch 61.42 | loss 4.50 | ppl 89.75\n", - "| epoch 11 | 600/ 3181 batches | lr 2.99 | ms/batch 61.39 | loss 4.49 | ppl 88.98\n", - "| epoch 11 | 800/ 3181 batches | lr 2.99 | ms/batch 61.33 | loss 4.49 | ppl 89.43\n", - "| epoch 11 | 1000/ 3181 batches | lr 2.99 | ms/batch 61.36 | loss 4.52 | ppl 92.09\n", - "| epoch 11 | 1200/ 3181 batches | lr 2.99 | ms/batch 61.45 | loss 4.47 | ppl 87.68\n" + "| epoch 2 | 4600/13484 batches | lr 4.75 | ms/batch 115.18 | loss 5.35 | ppl 210.15\n", + "| epoch 2 | 4800/13484 batches | lr 4.75 | ms/batch 115.24 | loss 5.32 | ppl 204.37\n", + "| epoch 2 | 5000/13484 batches | lr 4.75 | ms/batch 115.29 | loss 5.33 | ppl 205.42\n", + "| epoch 2 | 5200/13484 batches | lr 4.75 | ms/batch 115.24 | loss 5.31 | ppl 201.44\n", + "| epoch 2 | 5400/13484 batches | lr 4.75 | ms/batch 115.23 | loss 5.30 | ppl 200.48\n", + "| epoch 2 | 5600/13484 batches | lr 4.75 | ms/batch 115.26 | loss 5.29 | ppl 197.76\n", + "| epoch 2 | 5800/13484 batches | lr 4.75 | ms/batch 115.22 | loss 5.34 | ppl 207.65\n", + "| epoch 2 | 6000/13484 batches | lr 4.75 | ms/batch 115.11 | loss 5.32 | ppl 204.89\n", + "| epoch 2 | 6200/13484 batches | lr 4.75 | ms/batch 115.22 | loss 5.34 | ppl 209.24\n", + "| epoch 2 | 6400/13484 batches | lr 4.75 | ms/batch 115.14 | loss 5.31 | ppl 201.48\n", + "| epoch 2 | 6600/13484 batches | lr 4.75 | ms/batch 115.20 | loss 5.36 | ppl 212.87\n", + "| epoch 2 | 6800/13484 batches | lr 4.75 | ms/batch 115.13 | loss 5.29 | ppl 198.41\n", + "| epoch 2 | 7000/13484 batches | lr 4.75 | ms/batch 115.16 | loss 5.35 | ppl 211.39\n", + "| epoch 2 | 7200/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.30 | ppl 199.94\n", + "| epoch 2 | 7400/13484 batches | lr 4.75 | ms/batch 115.11 | loss 5.30 | ppl 200.81\n", + "| epoch 2 | 7600/13484 batches | lr 4.75 | ms/batch 115.31 | loss 5.35 | ppl 211.20\n", + "| epoch 2 | 7800/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.31 | ppl 201.93\n", + "| epoch 2 | 8000/13484 batches | lr 4.75 | ms/batch 115.21 | loss 5.29 | ppl 198.24\n", + "| epoch 2 | 8200/13484 batches | lr 4.75 | ms/batch 115.14 | loss 5.27 | ppl 194.75\n", + "| epoch 2 | 8400/13484 batches | lr 4.75 | ms/batch 115.21 | loss 5.29 | ppl 198.48\n", + "| epoch 2 | 8600/13484 batches | lr 4.75 | ms/batch 115.13 | loss 5.29 | ppl 198.11\n", + "| epoch 2 | 8800/13484 batches | lr 4.75 | ms/batch 115.24 | loss 5.34 | ppl 207.62\n", + "| epoch 2 | 9000/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.33 | ppl 205.55\n", + "| epoch 2 | 9200/13484 batches | lr 4.75 | ms/batch 115.27 | loss 5.33 | ppl 206.24\n", + "| epoch 2 | 9400/13484 batches | lr 4.75 | ms/batch 115.18 | loss 5.31 | ppl 201.81\n", + "| epoch 2 | 9600/13484 batches | lr 4.75 | ms/batch 115.25 | loss 5.29 | ppl 198.63\n", + "| epoch 2 | 9800/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.26 | ppl 192.87\n", + "| epoch 2 | 10000/13484 batches | lr 4.75 | ms/batch 115.20 | loss 5.30 | ppl 199.77\n", + "| epoch 2 | 10200/13484 batches | lr 4.75 | ms/batch 115.24 | loss 5.25 | ppl 191.30\n", + "| epoch 2 | 10400/13484 batches | lr 4.75 | ms/batch 115.24 | loss 5.22 | ppl 184.78\n", + "| epoch 2 | 10600/13484 batches | lr 4.75 | ms/batch 115.20 | loss 5.27 | ppl 194.07\n", + "| epoch 2 | 10800/13484 batches | lr 4.75 | ms/batch 115.23 | loss 5.30 | ppl 200.53\n", + "| epoch 2 | 11000/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.29 | ppl 198.68\n", + "| epoch 2 | 11200/13484 batches | lr 4.75 | ms/batch 115.21 | loss 5.28 | ppl 196.43\n", + "| epoch 2 | 11400/13484 batches | lr 4.75 | ms/batch 115.26 | loss 5.23 | ppl 186.61\n", + "| epoch 2 | 11600/13484 batches | lr 4.75 | ms/batch 115.13 | loss 5.27 | ppl 195.11\n", + "| epoch 2 | 11800/13484 batches | lr 4.75 | ms/batch 115.19 | loss 5.23 | ppl 186.19\n", + "| epoch 2 | 12000/13484 batches | lr 4.75 | ms/batch 115.22 | loss 5.31 | ppl 202.19\n", + "| epoch 2 | 12200/13484 batches | lr 4.75 | ms/batch 115.18 | loss 5.22 | ppl 184.46\n", + "| epoch 2 | 12400/13484 batches | lr 4.75 | ms/batch 115.32 | loss 5.23 | ppl 187.26\n", + "| epoch 2 | 12600/13484 batches | lr 4.75 | ms/batch 115.31 | loss 5.25 | ppl 189.65\n", + "| epoch 2 | 12800/13484 batches | lr 4.75 | ms/batch 115.26 | loss 5.28 | ppl 196.25\n", + "| epoch 2 | 13000/13484 batches | lr 4.75 | ms/batch 115.35 | loss 5.28 | ppl 196.31\n", + "| epoch 2 | 13200/13484 batches | lr 4.75 | ms/batch 115.32 | loss 5.28 | ppl 195.61\n", + "| epoch 2 | 13400/13484 batches | lr 4.75 | ms/batch 115.27 | loss 5.28 | ppl 195.80\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 2 | time: 1625.71s | valid loss 5.24 | valid ppl 188.48\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 3 | 200/13484 batches | lr 4.51 | ms/batch 115.84 | loss 5.32 | ppl 205.41\n", + "| epoch 3 | 400/13484 batches | lr 4.51 | ms/batch 115.17 | loss 5.28 | ppl 195.56\n", + "| epoch 3 | 600/13484 batches | lr 4.51 | ms/batch 115.12 | loss 5.22 | ppl 184.23\n", + "| epoch 3 | 800/13484 batches | lr 4.51 | ms/batch 115.21 | loss 5.23 | ppl 187.41\n", + "| epoch 3 | 1000/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.23 | ppl 186.77\n", + "| epoch 3 | 1200/13484 batches | lr 4.51 | ms/batch 115.14 | loss 5.22 | ppl 184.68\n", + "| epoch 3 | 1400/13484 batches | lr 4.51 | ms/batch 115.18 | loss 5.20 | ppl 181.17\n", + "| epoch 3 | 1600/13484 batches | lr 4.51 | ms/batch 115.27 | loss 5.25 | ppl 191.20\n", + "| epoch 3 | 1800/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.23 | ppl 186.87\n", + "| epoch 3 | 2000/13484 batches | lr 4.51 | ms/batch 115.23 | loss 5.19 | ppl 180.16\n", + "| epoch 3 | 2200/13484 batches | lr 4.51 | ms/batch 115.27 | loss 5.21 | ppl 183.82\n", + "| epoch 3 | 2400/13484 batches | lr 4.51 | ms/batch 115.20 | loss 5.21 | ppl 182.76\n", + "| epoch 3 | 2600/13484 batches | lr 4.51 | ms/batch 115.21 | loss 5.19 | ppl 180.25\n", + "| epoch 3 | 2800/13484 batches | lr 4.51 | ms/batch 115.28 | loss 5.22 | ppl 185.75\n", + "| epoch 3 | 3000/13484 batches | lr 4.51 | ms/batch 115.19 | loss 5.21 | ppl 183.06\n", + "| epoch 3 | 3200/13484 batches | lr 4.51 | ms/batch 115.23 | loss 5.17 | ppl 176.28\n", + "| epoch 3 | 3400/13484 batches | lr 4.51 | ms/batch 115.16 | loss 5.24 | ppl 187.88\n", + "| epoch 3 | 3600/13484 batches | lr 4.51 | ms/batch 115.16 | loss 5.21 | ppl 182.87\n", + "| epoch 3 | 3800/13484 batches | lr 4.51 | ms/batch 115.18 | loss 5.21 | ppl 182.52\n", + "| epoch 3 | 4000/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.15 | ppl 172.43\n", + "| epoch 3 | 4200/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.18 | ppl 177.72\n", + "| epoch 3 | 4400/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.19 | ppl 179.22\n", + "| epoch 3 | 4600/13484 batches | lr 4.51 | ms/batch 115.26 | loss 5.24 | ppl 187.99\n", + "| epoch 3 | 4800/13484 batches | lr 4.51 | ms/batch 115.17 | loss 5.24 | ppl 188.20\n", + "| epoch 3 | 5000/13484 batches | lr 4.51 | ms/batch 115.20 | loss 5.22 | ppl 184.24\n", + "| epoch 3 | 5200/13484 batches | lr 4.51 | ms/batch 115.24 | loss 5.21 | ppl 184.01\n", + "| epoch 3 | 5400/13484 batches | lr 4.51 | ms/batch 115.24 | loss 5.19 | ppl 178.94\n", + "| epoch 3 | 5600/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.19 | ppl 180.15\n", + "| epoch 3 | 5800/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.20 | ppl 181.24\n", + "| epoch 3 | 6000/13484 batches | lr 4.51 | ms/batch 115.19 | loss 5.22 | ppl 184.08\n", + "| epoch 3 | 6200/13484 batches | lr 4.51 | ms/batch 115.29 | loss 5.24 | ppl 187.77\n", + "| epoch 3 | 6400/13484 batches | lr 4.51 | ms/batch 115.19 | loss 5.21 | ppl 182.36\n", + "| epoch 3 | 6600/13484 batches | lr 4.51 | ms/batch 115.23 | loss 5.25 | ppl 190.52\n", + "| epoch 3 | 6800/13484 batches | lr 4.51 | ms/batch 115.27 | loss 5.20 | ppl 180.56\n", + "| epoch 3 | 7000/13484 batches | lr 4.51 | ms/batch 115.14 | loss 5.23 | ppl 186.73\n", + "| epoch 3 | 7200/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.19 | ppl 179.90\n", + "| epoch 3 | 7400/13484 batches | lr 4.51 | ms/batch 115.28 | loss 5.21 | ppl 182.43\n", + "| epoch 3 | 7600/13484 batches | lr 4.51 | ms/batch 115.09 | loss 5.20 | ppl 181.48\n", + "| epoch 3 | 7800/13484 batches | lr 4.51 | ms/batch 115.26 | loss 5.22 | ppl 185.25\n", + "| epoch 3 | 8000/13484 batches | lr 4.51 | ms/batch 115.27 | loss 5.18 | ppl 178.05\n", + "| epoch 3 | 8200/13484 batches | lr 4.51 | ms/batch 115.20 | loss 5.18 | ppl 178.41\n", + "| epoch 3 | 8400/13484 batches | lr 4.51 | ms/batch 115.24 | loss 5.20 | ppl 181.07\n", + "| epoch 3 | 8600/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.20 | ppl 182.10\n", + "| epoch 3 | 8800/13484 batches | lr 4.51 | ms/batch 115.28 | loss 5.18 | ppl 177.86\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 11 | 1400/ 3181 batches | lr 2.99 | ms/batch 61.38 | loss 4.49 | ppl 89.02\n", - "| epoch 11 | 1600/ 3181 batches | lr 2.99 | ms/batch 61.41 | loss 4.49 | ppl 89.15\n", - "| epoch 11 | 1800/ 3181 batches | lr 2.99 | ms/batch 61.33 | loss 4.50 | ppl 90.22\n", - "| epoch 11 | 2000/ 3181 batches | lr 2.99 | ms/batch 61.30 | loss 4.46 | ppl 86.81\n", - "| epoch 11 | 2200/ 3181 batches | lr 2.99 | ms/batch 61.35 | loss 4.45 | ppl 85.80\n", - "| epoch 11 | 2400/ 3181 batches | lr 2.99 | ms/batch 61.35 | loss 4.46 | ppl 86.48\n", - "| epoch 11 | 2600/ 3181 batches | lr 2.99 | ms/batch 61.30 | loss 4.41 | ppl 82.18\n", - "| epoch 11 | 2800/ 3181 batches | lr 2.99 | ms/batch 61.40 | loss 4.48 | ppl 88.42\n", - "| epoch 11 | 3000/ 3181 batches | lr 2.99 | ms/batch 61.42 | loss 4.39 | ppl 80.87\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 11 | time: 206.64s | valid loss 5.39 | valid ppl 219.73\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 12 | 200/ 3181 batches | lr 2.84 | ms/batch 61.75 | loss 4.50 | ppl 89.97\n", - "| epoch 12 | 400/ 3181 batches | lr 2.84 | ms/batch 61.42 | loss 4.43 | ppl 84.04\n", - "| epoch 12 | 600/ 3181 batches | lr 2.84 | ms/batch 61.45 | loss 4.42 | ppl 83.14\n", - "| epoch 12 | 800/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.42 | ppl 83.42\n", - "| epoch 12 | 1000/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.46 | ppl 86.36\n", - "| epoch 12 | 1200/ 3181 batches | lr 2.84 | ms/batch 61.37 | loss 4.41 | ppl 82.13\n", - "| epoch 12 | 1400/ 3181 batches | lr 2.84 | ms/batch 61.32 | loss 4.42 | ppl 83.46\n", - "| epoch 12 | 1600/ 3181 batches | lr 2.84 | ms/batch 61.38 | loss 4.42 | ppl 82.96\n", - "| epoch 12 | 1800/ 3181 batches | lr 2.84 | ms/batch 61.38 | loss 4.44 | ppl 84.42\n", - "| epoch 12 | 2000/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.40 | ppl 81.54\n", - "| epoch 12 | 2200/ 3181 batches | lr 2.84 | ms/batch 61.36 | loss 4.39 | ppl 80.50\n", - "| epoch 12 | 2400/ 3181 batches | lr 2.84 | ms/batch 61.35 | loss 4.39 | ppl 80.92\n", - "| epoch 12 | 2600/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.35 | ppl 77.30\n", - "| epoch 12 | 2800/ 3181 batches | lr 2.84 | ms/batch 61.39 | loss 4.42 | ppl 83.09\n", - "| epoch 12 | 3000/ 3181 batches | lr 2.84 | ms/batch 61.40 | loss 4.33 | ppl 75.78\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 12 | time: 206.67s | valid loss 5.42 | valid ppl 224.91\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 13 | 200/ 3181 batches | lr 2.70 | ms/batch 61.68 | loss 4.43 | ppl 83.99\n", - "| epoch 13 | 400/ 3181 batches | lr 2.70 | ms/batch 61.34 | loss 4.36 | ppl 78.48\n", - "| epoch 13 | 600/ 3181 batches | lr 2.70 | ms/batch 61.33 | loss 4.35 | ppl 77.76\n", - "| epoch 13 | 800/ 3181 batches | lr 2.70 | ms/batch 61.31 | loss 4.37 | ppl 78.88\n", - "| epoch 13 | 1000/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.39 | ppl 80.64\n", - "| epoch 13 | 1200/ 3181 batches | lr 2.70 | ms/batch 61.37 | loss 4.34 | ppl 76.95\n", - "| epoch 13 | 1400/ 3181 batches | lr 2.70 | ms/batch 61.41 | loss 4.36 | ppl 78.49\n", - "| epoch 13 | 1600/ 3181 batches | lr 2.70 | ms/batch 61.35 | loss 4.36 | ppl 77.93\n", - "| epoch 13 | 1800/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.37 | ppl 79.08\n", - "| epoch 13 | 2000/ 3181 batches | lr 2.70 | ms/batch 61.34 | loss 4.34 | ppl 76.68\n", - "| epoch 13 | 2200/ 3181 batches | lr 2.70 | ms/batch 61.37 | loss 4.32 | ppl 75.17\n", - "| epoch 13 | 2400/ 3181 batches | lr 2.70 | ms/batch 61.38 | loss 4.33 | ppl 75.87\n", - "| epoch 13 | 2600/ 3181 batches | lr 2.70 | ms/batch 61.29 | loss 4.28 | ppl 72.20\n", - "| epoch 13 | 2800/ 3181 batches | lr 2.70 | ms/batch 61.27 | loss 4.36 | ppl 78.01\n", - "| epoch 13 | 3000/ 3181 batches | lr 2.70 | ms/batch 61.36 | loss 4.26 | ppl 70.91\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 13 | time: 206.58s | valid loss 5.42 | valid ppl 225.31\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 14 | 200/ 3181 batches | lr 2.57 | ms/batch 61.59 | loss 4.37 | ppl 79.39\n", - "| epoch 14 | 400/ 3181 batches | lr 2.57 | ms/batch 61.37 | loss 4.30 | ppl 73.94\n", - "| epoch 14 | 600/ 3181 batches | lr 2.57 | ms/batch 61.32 | loss 4.30 | ppl 73.50\n", - "| epoch 14 | 800/ 3181 batches | lr 2.57 | ms/batch 61.36 | loss 4.31 | ppl 74.12\n", - "| epoch 14 | 1000/ 3181 batches | lr 2.57 | ms/batch 61.41 | loss 4.33 | ppl 75.86\n", - "| epoch 14 | 1200/ 3181 batches | lr 2.57 | ms/batch 61.34 | loss 4.29 | ppl 72.64\n", - "| epoch 14 | 1400/ 3181 batches | lr 2.57 | ms/batch 61.39 | loss 4.31 | ppl 74.29\n", - "| epoch 14 | 1600/ 3181 batches | lr 2.57 | ms/batch 61.31 | loss 4.29 | ppl 73.17\n", - "| epoch 14 | 1800/ 3181 batches | lr 2.57 | ms/batch 61.41 | loss 4.31 | ppl 74.28\n", - "| epoch 14 | 2000/ 3181 batches | lr 2.57 | ms/batch 61.34 | loss 4.28 | ppl 71.97\n", - "| epoch 14 | 2200/ 3181 batches | lr 2.57 | ms/batch 61.44 | loss 4.26 | ppl 71.13\n", - "| epoch 14 | 2400/ 3181 batches | lr 2.57 | ms/batch 61.32 | loss 4.27 | ppl 71.61\n", - "| epoch 14 | 2600/ 3181 batches | lr 2.57 | ms/batch 61.42 | loss 4.22 | ppl 67.93\n", - "| epoch 14 | 2800/ 3181 batches | lr 2.57 | ms/batch 61.42 | loss 4.30 | ppl 73.68\n", - "| epoch 14 | 3000/ 3181 batches | lr 2.57 | ms/batch 61.36 | loss 4.21 | ppl 67.08\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 14 | time: 206.63s | valid loss 5.47 | valid ppl 236.36\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 15 | 200/ 3181 batches | lr 2.44 | ms/batch 61.66 | loss 4.32 | ppl 75.20\n", - "| epoch 15 | 400/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.25 | ppl 69.78\n", - "| epoch 15 | 600/ 3181 batches | lr 2.44 | ms/batch 61.30 | loss 4.23 | ppl 68.98\n", - "| epoch 15 | 800/ 3181 batches | lr 2.44 | ms/batch 61.34 | loss 4.25 | ppl 70.20\n", - "| epoch 15 | 1000/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.28 | ppl 71.96\n", - "| epoch 15 | 1200/ 3181 batches | lr 2.44 | ms/batch 61.29 | loss 4.23 | ppl 68.62\n", - "| epoch 15 | 1400/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.25 | ppl 70.18\n", - "| epoch 15 | 1600/ 3181 batches | lr 2.44 | ms/batch 61.37 | loss 4.23 | ppl 68.99\n", - "| epoch 15 | 1800/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.25 | ppl 69.87\n", - "| epoch 15 | 2000/ 3181 batches | lr 2.44 | ms/batch 61.36 | loss 4.22 | ppl 67.79\n", - "| epoch 15 | 2200/ 3181 batches | lr 2.44 | ms/batch 61.40 | loss 4.21 | ppl 67.21\n", - "| epoch 15 | 2400/ 3181 batches | lr 2.44 | ms/batch 61.39 | loss 4.21 | ppl 67.61\n", - "| epoch 15 | 2600/ 3181 batches | lr 2.44 | ms/batch 61.40 | loss 4.15 | ppl 63.73\n", - "| epoch 15 | 2800/ 3181 batches | lr 2.44 | ms/batch 61.37 | loss 4.24 | ppl 69.43\n", - "| epoch 15 | 3000/ 3181 batches | lr 2.44 | ms/batch 61.38 | loss 4.15 | ppl 63.16\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 15 | time: 206.62s | valid loss 5.47 | valid ppl 238.57\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 16 | 200/ 3181 batches | lr 2.32 | ms/batch 61.60 | loss 4.26 | ppl 71.14\n", - "| epoch 16 | 400/ 3181 batches | lr 2.32 | ms/batch 61.33 | loss 4.19 | ppl 65.93\n", - "| epoch 16 | 600/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.18 | ppl 65.22\n", - "| epoch 16 | 800/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.19 | ppl 66.07\n", - "| epoch 16 | 1000/ 3181 batches | lr 2.32 | ms/batch 61.41 | loss 4.22 | ppl 68.20\n", - "| epoch 16 | 1200/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.17 | ppl 65.03\n", - "| epoch 16 | 1400/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.20 | ppl 66.72\n", - "| epoch 16 | 1600/ 3181 batches | lr 2.32 | ms/batch 61.38 | loss 4.19 | ppl 65.70\n", - "| epoch 16 | 1800/ 3181 batches | lr 2.32 | ms/batch 61.37 | loss 4.19 | ppl 66.27\n" + "| epoch 3 | 9000/13484 batches | lr 4.51 | ms/batch 115.21 | loss 5.21 | ppl 182.88\n", + "| epoch 3 | 9200/13484 batches | lr 4.51 | ms/batch 115.28 | loss 5.21 | ppl 183.47\n", + "| epoch 3 | 9400/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.22 | ppl 184.25\n", + "| epoch 3 | 9600/13484 batches | lr 4.51 | ms/batch 115.27 | loss 5.18 | ppl 177.24\n", + "| epoch 3 | 9800/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.16 | ppl 174.24\n", + "| epoch 3 | 10000/13484 batches | lr 4.51 | ms/batch 115.22 | loss 5.21 | ppl 182.29\n", + "| epoch 3 | 10200/13484 batches | lr 4.51 | ms/batch 115.13 | loss 5.17 | ppl 175.34\n", + "| epoch 3 | 10400/13484 batches | lr 4.51 | ms/batch 115.28 | loss 5.14 | ppl 170.79\n", + "| epoch 3 | 10600/13484 batches | lr 4.51 | ms/batch 115.20 | loss 5.17 | ppl 176.55\n", + "| epoch 3 | 10800/13484 batches | lr 4.51 | ms/batch 115.24 | loss 5.22 | ppl 185.77\n", + "| epoch 3 | 11000/13484 batches | lr 4.51 | ms/batch 115.23 | loss 5.19 | ppl 179.38\n", + "| epoch 3 | 11200/13484 batches | lr 4.51 | ms/batch 115.30 | loss 5.19 | ppl 179.59\n", + "| epoch 3 | 11400/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.14 | ppl 171.38\n", + "| epoch 3 | 11600/13484 batches | lr 4.51 | ms/batch 115.21 | loss 5.18 | ppl 178.51\n", + "| epoch 3 | 11800/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.16 | ppl 174.52\n", + "| epoch 3 | 12000/13484 batches | lr 4.51 | ms/batch 115.25 | loss 5.20 | ppl 181.28\n", + "| epoch 3 | 12200/13484 batches | lr 4.51 | ms/batch 115.13 | loss 5.14 | ppl 170.89\n", + "| epoch 3 | 12400/13484 batches | lr 4.51 | ms/batch 115.24 | loss 5.14 | ppl 169.88\n", + "| epoch 3 | 12600/13484 batches | lr 4.51 | ms/batch 115.20 | loss 5.15 | ppl 172.67\n", + "| epoch 3 | 12800/13484 batches | lr 4.51 | ms/batch 115.19 | loss 5.18 | ppl 176.89\n", + "| epoch 3 | 13000/13484 batches | lr 4.51 | ms/batch 115.23 | loss 5.19 | ppl 179.90\n", + "| epoch 3 | 13200/13484 batches | lr 4.51 | ms/batch 115.29 | loss 5.17 | ppl 175.90\n", + "| epoch 3 | 13400/13484 batches | lr 4.51 | ms/batch 115.21 | loss 5.20 | ppl 182.07\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 3 | time: 1625.86s | valid loss 5.19 | valid ppl 178.94\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 4 | 200/13484 batches | lr 4.29 | ms/batch 115.82 | loss 5.22 | ppl 184.03\n", + "| epoch 4 | 400/13484 batches | lr 4.29 | ms/batch 115.24 | loss 5.16 | ppl 174.84\n", + "| epoch 4 | 600/13484 batches | lr 4.29 | ms/batch 115.21 | loss 5.14 | ppl 170.18\n", + "| epoch 4 | 800/13484 batches | lr 4.29 | ms/batch 115.20 | loss 5.15 | ppl 171.64\n", + "| epoch 4 | 1000/13484 batches | lr 4.29 | ms/batch 115.20 | loss 5.14 | ppl 171.26\n", + "| epoch 4 | 1200/13484 batches | lr 4.29 | ms/batch 115.14 | loss 5.14 | ppl 171.18\n", + "| epoch 4 | 1400/13484 batches | lr 4.29 | ms/batch 115.13 | loss 5.12 | ppl 166.55\n", + "| epoch 4 | 1600/13484 batches | lr 4.29 | ms/batch 115.26 | loss 5.17 | ppl 176.35\n", + "| epoch 4 | 1800/13484 batches | lr 4.29 | ms/batch 115.21 | loss 5.15 | ppl 172.34\n", + "| epoch 4 | 2000/13484 batches | lr 4.29 | ms/batch 115.20 | loss 5.13 | ppl 169.46\n", + "| epoch 4 | 2200/13484 batches | lr 4.29 | ms/batch 115.23 | loss 5.16 | ppl 173.74\n", + "| epoch 4 | 2400/13484 batches | lr 4.29 | ms/batch 115.14 | loss 5.14 | ppl 170.76\n", + "| epoch 4 | 2600/13484 batches | lr 4.29 | ms/batch 115.21 | loss 5.11 | ppl 165.36\n", + "| epoch 4 | 2800/13484 batches | lr 4.29 | ms/batch 115.14 | loss 5.15 | ppl 173.15\n", + "| epoch 4 | 3000/13484 batches | lr 4.29 | ms/batch 115.15 | loss 5.14 | ppl 171.39\n", + "| epoch 4 | 3200/13484 batches | lr 4.29 | ms/batch 115.27 | loss 5.10 | ppl 164.27\n", + "| epoch 4 | 3400/13484 batches | lr 4.29 | ms/batch 115.21 | loss 5.16 | ppl 174.64\n", + "| epoch 4 | 3600/13484 batches | lr 4.29 | ms/batch 115.20 | loss 5.13 | ppl 168.98\n", + "| epoch 4 | 3800/13484 batches | lr 4.29 | ms/batch 115.25 | loss 5.12 | ppl 167.42\n", + "| epoch 4 | 4000/13484 batches | lr 4.29 | ms/batch 115.26 | loss 5.08 | ppl 161.02\n", + "| epoch 4 | 4200/13484 batches | lr 4.29 | ms/batch 115.25 | loss 5.11 | ppl 165.33\n", + "| epoch 4 | 4400/13484 batches | lr 4.29 | ms/batch 115.26 | loss 5.11 | ppl 165.61\n", + "| epoch 4 | 4600/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.16 | ppl 173.90\n", + "| epoch 4 | 4800/13484 batches | lr 4.29 | ms/batch 115.27 | loss 5.15 | ppl 172.81\n", + "| epoch 4 | 5000/13484 batches | lr 4.29 | ms/batch 115.20 | loss 5.14 | ppl 169.98\n", + "| epoch 4 | 5200/13484 batches | lr 4.29 | ms/batch 115.22 | loss 5.13 | ppl 168.94\n", + "| epoch 4 | 5400/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.10 | ppl 164.28\n", + "| epoch 4 | 5600/13484 batches | lr 4.29 | ms/batch 115.25 | loss 5.12 | ppl 167.23\n", + "| epoch 4 | 5800/13484 batches | lr 4.29 | ms/batch 115.32 | loss 5.12 | ppl 167.63\n", + "| epoch 4 | 6000/13484 batches | lr 4.29 | ms/batch 115.34 | loss 5.14 | ppl 170.26\n", + "| epoch 4 | 6200/13484 batches | lr 4.29 | ms/batch 115.31 | loss 5.18 | ppl 177.13\n", + "| epoch 4 | 6400/13484 batches | lr 4.29 | ms/batch 115.27 | loss 5.13 | ppl 169.45\n", + "| epoch 4 | 6600/13484 batches | lr 4.29 | ms/batch 115.26 | loss 5.16 | ppl 174.83\n", + "| epoch 4 | 6800/13484 batches | lr 4.29 | ms/batch 115.32 | loss 5.11 | ppl 165.20\n", + "| epoch 4 | 7000/13484 batches | lr 4.29 | ms/batch 115.19 | loss 5.16 | ppl 174.72\n", + "| epoch 4 | 7200/13484 batches | lr 4.29 | ms/batch 115.22 | loss 5.12 | ppl 167.83\n", + "| epoch 4 | 7400/13484 batches | lr 4.29 | ms/batch 115.29 | loss 5.12 | ppl 167.13\n", + "| epoch 4 | 7600/13484 batches | lr 4.29 | ms/batch 115.33 | loss 5.13 | ppl 168.29\n", + "| epoch 4 | 7800/13484 batches | lr 4.29 | ms/batch 115.30 | loss 5.12 | ppl 167.88\n", + "| epoch 4 | 8000/13484 batches | lr 4.29 | ms/batch 115.20 | loss 5.11 | ppl 165.65\n", + "| epoch 4 | 8200/13484 batches | lr 4.29 | ms/batch 115.32 | loss 5.10 | ppl 164.16\n", + "| epoch 4 | 8400/13484 batches | lr 4.29 | ms/batch 115.29 | loss 5.12 | ppl 166.71\n", + "| epoch 4 | 8600/13484 batches | lr 4.29 | ms/batch 115.32 | loss 5.14 | ppl 169.91\n", + "| epoch 4 | 8800/13484 batches | lr 4.29 | ms/batch 115.30 | loss 5.11 | ppl 166.00\n", + "| epoch 4 | 9000/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.13 | ppl 169.67\n", + "| epoch 4 | 9200/13484 batches | lr 4.29 | ms/batch 115.31 | loss 5.13 | ppl 169.46\n", + "| epoch 4 | 9400/13484 batches | lr 4.29 | ms/batch 115.33 | loss 5.15 | ppl 171.85\n", + "| epoch 4 | 9600/13484 batches | lr 4.29 | ms/batch 115.29 | loss 5.11 | ppl 165.01\n", + "| epoch 4 | 9800/13484 batches | lr 4.29 | ms/batch 115.21 | loss 5.09 | ppl 162.51\n", + "| epoch 4 | 10000/13484 batches | lr 4.29 | ms/batch 115.22 | loss 5.12 | ppl 167.81\n", + "| epoch 4 | 10200/13484 batches | lr 4.29 | ms/batch 115.33 | loss 5.10 | ppl 163.43\n", + "| epoch 4 | 10400/13484 batches | lr 4.29 | ms/batch 115.22 | loss 5.07 | ppl 158.79\n", + "| epoch 4 | 10600/13484 batches | lr 4.29 | ms/batch 115.30 | loss 5.10 | ppl 163.54\n", + "| epoch 4 | 10800/13484 batches | lr 4.29 | ms/batch 115.39 | loss 5.12 | ppl 167.03\n", + "| epoch 4 | 11000/13484 batches | lr 4.29 | ms/batch 115.33 | loss 5.11 | ppl 166.04\n", + "| epoch 4 | 11200/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.11 | ppl 165.67\n", + "| epoch 4 | 11400/13484 batches | lr 4.29 | ms/batch 115.26 | loss 5.06 | ppl 157.42\n", + "| epoch 4 | 11600/13484 batches | lr 4.29 | ms/batch 115.25 | loss 5.10 | ppl 164.17\n", + "| epoch 4 | 11800/13484 batches | lr 4.29 | ms/batch 115.36 | loss 5.07 | ppl 159.41\n", + "| epoch 4 | 12000/13484 batches | lr 4.29 | ms/batch 115.22 | loss 5.13 | ppl 168.33\n", + "| epoch 4 | 12200/13484 batches | lr 4.29 | ms/batch 115.24 | loss 5.05 | ppl 155.52\n", + "| epoch 4 | 12400/13484 batches | lr 4.29 | ms/batch 115.25 | loss 5.07 | ppl 159.62\n", + "| epoch 4 | 12600/13484 batches | lr 4.29 | ms/batch 115.24 | loss 5.09 | ppl 161.65\n", + "| epoch 4 | 12800/13484 batches | lr 4.29 | ms/batch 115.22 | loss 5.10 | ppl 164.49\n", + "| epoch 4 | 13000/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.10 | ppl 163.47\n", + "| epoch 4 | 13200/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.09 | ppl 162.89\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 16 | 2000/ 3181 batches | lr 2.32 | ms/batch 61.40 | loss 4.17 | ppl 64.69\n", - "| epoch 16 | 2200/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.15 | ppl 63.70\n", - "| epoch 16 | 2400/ 3181 batches | lr 2.32 | ms/batch 61.36 | loss 4.17 | ppl 64.52\n", - "| epoch 16 | 2600/ 3181 batches | lr 2.32 | ms/batch 61.41 | loss 4.11 | ppl 60.85\n", - "| epoch 16 | 2800/ 3181 batches | lr 2.32 | ms/batch 61.35 | loss 4.19 | ppl 66.21\n", - "| epoch 16 | 3000/ 3181 batches | lr 2.32 | ms/batch 61.39 | loss 4.09 | ppl 59.76\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 16 | time: 206.63s | valid loss 5.50 | valid ppl 243.52\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 17 | 200/ 3181 batches | lr 2.20 | ms/batch 61.74 | loss 4.21 | ppl 67.46\n", - "| epoch 17 | 400/ 3181 batches | lr 2.20 | ms/batch 61.43 | loss 4.14 | ppl 62.91\n", - "| epoch 17 | 600/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.13 | ppl 61.89\n", - "| epoch 17 | 800/ 3181 batches | lr 2.20 | ms/batch 61.44 | loss 4.15 | ppl 63.38\n", - "| epoch 17 | 1000/ 3181 batches | lr 2.20 | ms/batch 61.34 | loss 4.17 | ppl 64.88\n", - "| epoch 17 | 1200/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.13 | ppl 62.19\n", - "| epoch 17 | 1400/ 3181 batches | lr 2.20 | ms/batch 61.38 | loss 4.15 | ppl 63.41\n", - "| epoch 17 | 1600/ 3181 batches | lr 2.20 | ms/batch 61.37 | loss 4.13 | ppl 62.14\n", - "| epoch 17 | 1800/ 3181 batches | lr 2.20 | ms/batch 61.40 | loss 4.15 | ppl 63.28\n", - "| epoch 17 | 2000/ 3181 batches | lr 2.20 | ms/batch 61.34 | loss 4.12 | ppl 61.53\n", - "| epoch 17 | 2200/ 3181 batches | lr 2.20 | ms/batch 61.35 | loss 4.10 | ppl 60.52\n", - "| epoch 17 | 2400/ 3181 batches | lr 2.20 | ms/batch 61.45 | loss 4.11 | ppl 61.21\n", - "| epoch 17 | 2600/ 3181 batches | lr 2.20 | ms/batch 61.33 | loss 4.06 | ppl 58.01\n", - "| epoch 17 | 2800/ 3181 batches | lr 2.20 | ms/batch 61.36 | loss 4.14 | ppl 62.99\n", - "| epoch 17 | 3000/ 3181 batches | lr 2.20 | ms/batch 61.36 | loss 4.04 | ppl 56.98\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 17 | time: 206.66s | valid loss 5.51 | valid ppl 245.93\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 18 | 200/ 3181 batches | lr 2.09 | ms/batch 61.64 | loss 4.16 | ppl 64.27\n", - "| epoch 18 | 400/ 3181 batches | lr 2.09 | ms/batch 61.43 | loss 4.09 | ppl 59.95\n", - "| epoch 18 | 600/ 3181 batches | lr 2.09 | ms/batch 61.38 | loss 4.08 | ppl 58.99\n", - "| epoch 18 | 800/ 3181 batches | lr 2.09 | ms/batch 61.31 | loss 4.10 | ppl 60.18\n", - "| epoch 18 | 1000/ 3181 batches | lr 2.09 | ms/batch 61.37 | loss 4.12 | ppl 61.79\n", - "| epoch 18 | 1200/ 3181 batches | lr 2.09 | ms/batch 61.42 | loss 4.08 | ppl 58.92\n", - "| epoch 18 | 1400/ 3181 batches | lr 2.09 | ms/batch 61.36 | loss 4.10 | ppl 60.40\n", - "| epoch 18 | 1600/ 3181 batches | lr 2.09 | ms/batch 61.43 | loss 4.08 | ppl 59.34\n", - "| epoch 18 | 1800/ 3181 batches | lr 2.09 | ms/batch 61.36 | loss 4.09 | ppl 59.74\n", - "| epoch 18 | 2000/ 3181 batches | lr 2.09 | ms/batch 61.39 | loss 4.07 | ppl 58.43\n", - "| epoch 18 | 2200/ 3181 batches | lr 2.09 | ms/batch 61.32 | loss 4.06 | ppl 58.17\n", - "| epoch 18 | 2400/ 3181 batches | lr 2.09 | ms/batch 61.32 | loss 4.07 | ppl 58.27\n", - "| epoch 18 | 2600/ 3181 batches | lr 2.09 | ms/batch 61.35 | loss 4.01 | ppl 55.01\n", - "| epoch 18 | 2800/ 3181 batches | lr 2.09 | ms/batch 61.40 | loss 4.09 | ppl 59.91\n", - "| epoch 18 | 3000/ 3181 batches | lr 2.09 | ms/batch 61.34 | loss 4.00 | ppl 54.82\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 18 | time: 206.65s | valid loss 5.52 | valid ppl 248.66\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 19 | 200/ 3181 batches | lr 1.99 | ms/batch 61.63 | loss 4.12 | ppl 61.26\n", - "| epoch 19 | 400/ 3181 batches | lr 1.99 | ms/batch 61.36 | loss 4.04 | ppl 57.10\n", - "| epoch 19 | 600/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.03 | ppl 56.18\n", - "| epoch 19 | 800/ 3181 batches | lr 1.99 | ms/batch 61.36 | loss 4.06 | ppl 57.74\n", - "| epoch 19 | 1000/ 3181 batches | lr 1.99 | ms/batch 61.49 | loss 4.08 | ppl 59.10\n", - "| epoch 19 | 1200/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.03 | ppl 56.27\n", - "| epoch 19 | 1400/ 3181 batches | lr 1.99 | ms/batch 61.34 | loss 4.06 | ppl 57.99\n", - "| epoch 19 | 1600/ 3181 batches | lr 1.99 | ms/batch 61.40 | loss 4.04 | ppl 56.78\n", - "| epoch 19 | 1800/ 3181 batches | lr 1.99 | ms/batch 61.39 | loss 4.05 | ppl 57.32\n", - "| epoch 19 | 2000/ 3181 batches | lr 1.99 | ms/batch 61.43 | loss 4.03 | ppl 56.16\n", - "| epoch 19 | 2200/ 3181 batches | lr 1.99 | ms/batch 61.34 | loss 4.02 | ppl 55.62\n", - "| epoch 19 | 2400/ 3181 batches | lr 1.99 | ms/batch 61.42 | loss 4.02 | ppl 55.68\n", - "| epoch 19 | 2600/ 3181 batches | lr 1.99 | ms/batch 61.38 | loss 3.97 | ppl 52.86\n", - "| epoch 19 | 2800/ 3181 batches | lr 1.99 | ms/batch 61.33 | loss 4.05 | ppl 57.12\n", - "| epoch 19 | 3000/ 3181 batches | lr 1.99 | ms/batch 61.31 | loss 3.95 | ppl 52.08\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 19 | time: 206.62s | valid loss 5.55 | valid ppl 257.12\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 20 | 200/ 3181 batches | lr 1.89 | ms/batch 61.70 | loss 4.07 | ppl 58.59\n", - "| epoch 20 | 400/ 3181 batches | lr 1.89 | ms/batch 61.38 | loss 4.01 | ppl 55.07\n", - "| epoch 20 | 600/ 3181 batches | lr 1.89 | ms/batch 61.40 | loss 3.99 | ppl 53.82\n", - "| epoch 20 | 800/ 3181 batches | lr 1.89 | ms/batch 61.40 | loss 4.01 | ppl 55.29\n", - "| epoch 20 | 1000/ 3181 batches | lr 1.89 | ms/batch 61.35 | loss 4.04 | ppl 56.83\n", - "| epoch 20 | 1200/ 3181 batches | lr 1.89 | ms/batch 61.34 | loss 3.99 | ppl 54.01\n", - "| epoch 20 | 1400/ 3181 batches | lr 1.89 | ms/batch 61.35 | loss 4.02 | ppl 55.48\n", - "| epoch 20 | 1600/ 3181 batches | lr 1.89 | ms/batch 61.33 | loss 4.00 | ppl 54.51\n", - "| epoch 20 | 1800/ 3181 batches | lr 1.89 | ms/batch 61.41 | loss 4.01 | ppl 55.02\n", - "| epoch 20 | 2000/ 3181 batches | lr 1.89 | ms/batch 61.38 | loss 3.99 | ppl 54.00\n", - "| epoch 20 | 2200/ 3181 batches | lr 1.89 | ms/batch 61.39 | loss 3.97 | ppl 53.23\n", - "| epoch 20 | 2400/ 3181 batches | lr 1.89 | ms/batch 61.29 | loss 3.98 | ppl 53.61\n", - "| epoch 20 | 2600/ 3181 batches | lr 1.89 | ms/batch 61.30 | loss 3.92 | ppl 50.62\n", - "| epoch 20 | 2800/ 3181 batches | lr 1.89 | ms/batch 61.32 | loss 4.01 | ppl 55.04\n", - "| epoch 20 | 3000/ 3181 batches | lr 1.89 | ms/batch 61.39 | loss 3.92 | ppl 50.18\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 20 | time: 206.60s | valid loss 5.61 | valid ppl 273.93\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 21 | 200/ 3181 batches | lr 1.79 | ms/batch 61.65 | loss 4.03 | ppl 56.37\n", - "| epoch 21 | 400/ 3181 batches | lr 1.79 | ms/batch 61.42 | loss 3.96 | ppl 52.65\n", - "| epoch 21 | 600/ 3181 batches | lr 1.79 | ms/batch 61.43 | loss 3.94 | ppl 51.53\n", - "| epoch 21 | 800/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.97 | ppl 52.82\n", - "| epoch 21 | 1000/ 3181 batches | lr 1.79 | ms/batch 61.34 | loss 3.99 | ppl 54.28\n", - "| epoch 21 | 1200/ 3181 batches | lr 1.79 | ms/batch 61.31 | loss 3.95 | ppl 51.85\n", - "| epoch 21 | 1400/ 3181 batches | lr 1.79 | ms/batch 61.33 | loss 3.98 | ppl 53.51\n", - "| epoch 21 | 1600/ 3181 batches | lr 1.79 | ms/batch 61.37 | loss 3.96 | ppl 52.23\n", - "| epoch 21 | 1800/ 3181 batches | lr 1.79 | ms/batch 61.42 | loss 3.97 | ppl 52.95\n", - "| epoch 21 | 2000/ 3181 batches | lr 1.79 | ms/batch 61.38 | loss 3.95 | ppl 51.71\n", - "| epoch 21 | 2200/ 3181 batches | lr 1.79 | ms/batch 61.38 | loss 3.94 | ppl 51.19\n", - "| epoch 21 | 2400/ 3181 batches | lr 1.79 | ms/batch 61.34 | loss 3.94 | ppl 51.57\n" + "| epoch 4 | 13400/13484 batches | lr 4.29 | ms/batch 115.28 | loss 5.12 | ppl 166.66\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 4 | time: 1626.36s | valid loss 5.13 | valid ppl 168.54\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 5 | 200/13484 batches | lr 4.07 | ms/batch 115.90 | loss 5.14 | ppl 170.65\n", + "| epoch 5 | 400/13484 batches | lr 4.07 | ms/batch 115.23 | loss 5.09 | ppl 163.18\n", + "| epoch 5 | 600/13484 batches | lr 4.07 | ms/batch 115.19 | loss 5.07 | ppl 159.22\n", + "| epoch 5 | 800/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.08 | ppl 160.60\n", + "| epoch 5 | 1000/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.08 | ppl 160.49\n", + "| epoch 5 | 1200/13484 batches | lr 4.07 | ms/batch 115.15 | loss 5.07 | ppl 158.86\n", + "| epoch 5 | 1400/13484 batches | lr 4.07 | ms/batch 115.14 | loss 5.06 | ppl 156.88\n", + "| epoch 5 | 1600/13484 batches | lr 4.07 | ms/batch 115.15 | loss 5.10 | ppl 164.68\n", + "| epoch 5 | 1800/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.09 | ppl 161.68\n", + "| epoch 5 | 2000/13484 batches | lr 4.07 | ms/batch 115.19 | loss 5.05 | ppl 156.19\n", + "| epoch 5 | 2200/13484 batches | lr 4.07 | ms/batch 115.16 | loss 5.06 | ppl 157.65\n", + "| epoch 5 | 2400/13484 batches | lr 4.07 | ms/batch 115.23 | loss 5.05 | ppl 156.29\n", + "| epoch 5 | 2600/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.04 | ppl 155.08\n", + "| epoch 5 | 2800/13484 batches | lr 4.07 | ms/batch 115.12 | loss 5.08 | ppl 160.79\n", + "| epoch 5 | 3000/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.06 | ppl 157.93\n", + "| epoch 5 | 3200/13484 batches | lr 4.07 | ms/batch 115.19 | loss 5.03 | ppl 153.59\n", + "| epoch 5 | 3400/13484 batches | lr 4.07 | ms/batch 115.24 | loss 5.10 | ppl 164.69\n", + "| epoch 5 | 3600/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.07 | ppl 159.67\n", + "| epoch 5 | 3800/13484 batches | lr 4.07 | ms/batch 115.23 | loss 5.05 | ppl 156.33\n", + "| epoch 5 | 4000/13484 batches | lr 4.07 | ms/batch 115.30 | loss 5.00 | ppl 148.52\n", + "| epoch 5 | 4200/13484 batches | lr 4.07 | ms/batch 115.16 | loss 5.03 | ppl 153.04\n", + "| epoch 5 | 4400/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.04 | ppl 155.12\n", + "| epoch 5 | 4600/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.09 | ppl 162.86\n", + "| epoch 5 | 4800/13484 batches | lr 4.07 | ms/batch 115.21 | loss 5.07 | ppl 159.17\n", + "| epoch 5 | 5000/13484 batches | lr 4.07 | ms/batch 115.27 | loss 5.06 | ppl 157.50\n", + "| epoch 5 | 5200/13484 batches | lr 4.07 | ms/batch 115.15 | loss 5.06 | ppl 157.70\n", + "| epoch 5 | 5400/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.04 | ppl 154.31\n", + "| epoch 5 | 5600/13484 batches | lr 4.07 | ms/batch 115.18 | loss 5.05 | ppl 156.47\n", + "| epoch 5 | 5800/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.06 | ppl 157.43\n", + "| epoch 5 | 6000/13484 batches | lr 4.07 | ms/batch 115.15 | loss 5.07 | ppl 159.33\n", + "| epoch 5 | 6200/13484 batches | lr 4.07 | ms/batch 115.19 | loss 5.09 | ppl 163.19\n", + "| epoch 5 | 6400/13484 batches | lr 4.07 | ms/batch 115.19 | loss 5.07 | ppl 159.77\n", + "| epoch 5 | 6600/13484 batches | lr 4.07 | ms/batch 115.28 | loss 5.09 | ppl 163.17\n", + "| epoch 5 | 6800/13484 batches | lr 4.07 | ms/batch 115.11 | loss 5.03 | ppl 153.03\n", + "| epoch 5 | 7000/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.09 | ppl 161.90\n", + "| epoch 5 | 7200/13484 batches | lr 4.07 | ms/batch 115.21 | loss 5.06 | ppl 156.90\n", + "| epoch 5 | 7400/13484 batches | lr 4.07 | ms/batch 115.23 | loss 5.07 | ppl 159.02\n", + "| epoch 5 | 7600/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.05 | ppl 156.02\n", + "| epoch 5 | 7800/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.06 | ppl 157.20\n", + "| epoch 5 | 8000/13484 batches | lr 4.07 | ms/batch 115.20 | loss 5.04 | ppl 154.56\n", + "| epoch 5 | 8200/13484 batches | lr 4.07 | ms/batch 115.20 | loss 5.03 | ppl 152.46\n", + "| epoch 5 | 8400/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.06 | ppl 157.62\n", + "| epoch 5 | 8600/13484 batches | lr 4.07 | ms/batch 115.28 | loss 5.07 | ppl 158.74\n", + "| epoch 5 | 8800/13484 batches | lr 4.07 | ms/batch 115.30 | loss 5.04 | ppl 154.53\n", + "| epoch 5 | 9000/13484 batches | lr 4.07 | ms/batch 115.31 | loss 5.06 | ppl 157.02\n", + "| epoch 5 | 9200/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.07 | ppl 159.14\n", + "| epoch 5 | 9400/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.07 | ppl 159.15\n", + "| epoch 5 | 9600/13484 batches | lr 4.07 | ms/batch 115.24 | loss 5.04 | ppl 153.89\n", + "| epoch 5 | 9800/13484 batches | lr 4.07 | ms/batch 115.27 | loss 5.02 | ppl 151.96\n", + "| epoch 5 | 10000/13484 batches | lr 4.07 | ms/batch 115.24 | loss 5.05 | ppl 156.58\n", + "| epoch 5 | 10200/13484 batches | lr 4.07 | ms/batch 115.30 | loss 5.02 | ppl 152.10\n", + "| epoch 5 | 10400/13484 batches | lr 4.07 | ms/batch 115.33 | loss 4.98 | ppl 146.11\n", + "| epoch 5 | 10600/13484 batches | lr 4.07 | ms/batch 115.26 | loss 5.03 | ppl 153.27\n", + "| epoch 5 | 10800/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.06 | ppl 157.44\n", + "| epoch 5 | 11000/13484 batches | lr 4.07 | ms/batch 115.33 | loss 5.05 | ppl 156.34\n", + "| epoch 5 | 11200/13484 batches | lr 4.07 | ms/batch 115.22 | loss 5.04 | ppl 154.36\n", + "| epoch 5 | 11400/13484 batches | lr 4.07 | ms/batch 115.27 | loss 5.00 | ppl 148.51\n", + "| epoch 5 | 11600/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.03 | ppl 153.09\n", + "| epoch 5 | 11800/13484 batches | lr 4.07 | ms/batch 115.26 | loss 5.00 | ppl 148.85\n", + "| epoch 5 | 12000/13484 batches | lr 4.07 | ms/batch 115.25 | loss 5.06 | ppl 156.93\n", + "| epoch 5 | 12200/13484 batches | lr 4.07 | ms/batch 115.22 | loss 4.98 | ppl 145.89\n", + "| epoch 5 | 12400/13484 batches | lr 4.07 | ms/batch 115.20 | loss 5.00 | ppl 148.86\n", + "| epoch 5 | 12600/13484 batches | lr 4.07 | ms/batch 115.33 | loss 5.02 | ppl 151.32\n", + "| epoch 5 | 12800/13484 batches | lr 4.07 | ms/batch 115.23 | loss 5.04 | ppl 154.42\n", + "| epoch 5 | 13000/13484 batches | lr 4.07 | ms/batch 115.28 | loss 5.03 | ppl 152.95\n", + "| epoch 5 | 13200/13484 batches | lr 4.07 | ms/batch 115.17 | loss 5.03 | ppl 153.49\n", + "| epoch 5 | 13400/13484 batches | lr 4.07 | ms/batch 115.35 | loss 5.05 | ppl 155.92\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 5 | time: 1625.93s | valid loss 5.10 | valid ppl 164.06\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 6 | 200/13484 batches | lr 3.87 | ms/batch 115.82 | loss 5.07 | ppl 159.79\n", + "| epoch 6 | 400/13484 batches | lr 3.87 | ms/batch 115.22 | loss 5.04 | ppl 153.75\n", + "| epoch 6 | 600/13484 batches | lr 3.87 | ms/batch 115.14 | loss 5.01 | ppl 150.20\n", + "| epoch 6 | 800/13484 batches | lr 3.87 | ms/batch 115.25 | loss 5.01 | ppl 150.24\n", + "| epoch 6 | 1000/13484 batches | lr 3.87 | ms/batch 115.23 | loss 5.01 | ppl 149.42\n", + "| epoch 6 | 1200/13484 batches | lr 3.87 | ms/batch 115.09 | loss 5.01 | ppl 150.28\n", + "| epoch 6 | 1400/13484 batches | lr 3.87 | ms/batch 115.18 | loss 5.00 | ppl 148.53\n", + "| epoch 6 | 1600/13484 batches | lr 3.87 | ms/batch 115.23 | loss 5.05 | ppl 156.45\n", + "| epoch 6 | 1800/13484 batches | lr 3.87 | ms/batch 115.17 | loss 5.02 | ppl 151.97\n", + "| epoch 6 | 2000/13484 batches | lr 3.87 | ms/batch 115.14 | loss 5.00 | ppl 147.68\n", + "| epoch 6 | 2200/13484 batches | lr 3.87 | ms/batch 115.22 | loss 5.00 | ppl 148.99\n", + "| epoch 6 | 2400/13484 batches | lr 3.87 | ms/batch 115.19 | loss 5.00 | ppl 147.82\n", + "| epoch 6 | 2600/13484 batches | lr 3.87 | ms/batch 115.19 | loss 4.98 | ppl 145.20\n", + "| epoch 6 | 2800/13484 batches | lr 3.87 | ms/batch 115.20 | loss 5.02 | ppl 152.00\n", + "| epoch 6 | 3000/13484 batches | lr 3.87 | ms/batch 115.20 | loss 5.01 | ppl 149.24\n", + "| epoch 6 | 3200/13484 batches | lr 3.87 | ms/batch 115.23 | loss 4.98 | ppl 145.09\n", + "| epoch 6 | 3400/13484 batches | lr 3.87 | ms/batch 115.35 | loss 5.03 | ppl 153.68\n", + "| epoch 6 | 3600/13484 batches | lr 3.87 | ms/batch 115.23 | loss 5.01 | ppl 149.34\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 21 | 2600/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.88 | ppl 48.60\n", - "| epoch 21 | 2800/ 3181 batches | lr 1.79 | ms/batch 61.40 | loss 3.97 | ppl 52.99\n", - "| epoch 21 | 3000/ 3181 batches | lr 1.79 | ms/batch 61.32 | loss 3.87 | ppl 48.17\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 21 | time: 206.61s | valid loss 5.61 | valid ppl 273.11\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 22 | 200/ 3181 batches | lr 1.70 | ms/batch 61.70 | loss 3.99 | ppl 54.02\n", - "| epoch 22 | 400/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.92 | ppl 50.52\n", - "| epoch 22 | 600/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.90 | ppl 49.61\n", - "| epoch 22 | 800/ 3181 batches | lr 1.70 | ms/batch 61.33 | loss 3.93 | ppl 51.15\n", - "| epoch 22 | 1000/ 3181 batches | lr 1.70 | ms/batch 61.34 | loss 3.96 | ppl 52.34\n", - "| epoch 22 | 1200/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.91 | ppl 50.10\n", - "| epoch 22 | 1400/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.94 | ppl 51.37\n", - "| epoch 22 | 1600/ 3181 batches | lr 1.70 | ms/batch 61.37 | loss 3.92 | ppl 50.25\n", - "| epoch 22 | 1800/ 3181 batches | lr 1.70 | ms/batch 61.36 | loss 3.93 | ppl 50.89\n", - "| epoch 22 | 2000/ 3181 batches | lr 1.70 | ms/batch 61.30 | loss 3.91 | ppl 49.70\n", - "| epoch 22 | 2200/ 3181 batches | lr 1.70 | ms/batch 61.43 | loss 3.90 | ppl 49.28\n", - "| epoch 22 | 2400/ 3181 batches | lr 1.70 | ms/batch 61.37 | loss 3.90 | ppl 49.46\n", - "| epoch 22 | 2600/ 3181 batches | lr 1.70 | ms/batch 61.41 | loss 3.84 | ppl 46.62\n", - "| epoch 22 | 2800/ 3181 batches | lr 1.70 | ms/batch 61.38 | loss 3.93 | ppl 50.75\n", - "| epoch 22 | 3000/ 3181 batches | lr 1.70 | ms/batch 61.34 | loss 3.83 | ppl 46.27\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 22 | time: 206.60s | valid loss 5.61 | valid ppl 273.57\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 23 | 200/ 3181 batches | lr 1.62 | ms/batch 61.61 | loss 3.96 | ppl 52.31\n", - "| epoch 23 | 400/ 3181 batches | lr 1.62 | ms/batch 61.32 | loss 3.88 | ppl 48.56\n", - "| epoch 23 | 600/ 3181 batches | lr 1.62 | ms/batch 61.35 | loss 3.86 | ppl 47.70\n", - "| epoch 23 | 800/ 3181 batches | lr 1.62 | ms/batch 61.31 | loss 3.90 | ppl 49.41\n", - "| epoch 23 | 1000/ 3181 batches | lr 1.62 | ms/batch 61.41 | loss 3.92 | ppl 50.42\n", - "| epoch 23 | 1200/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.88 | ppl 48.43\n", - "| epoch 23 | 1400/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.91 | ppl 49.85\n", - "| epoch 23 | 1600/ 3181 batches | lr 1.62 | ms/batch 61.30 | loss 3.88 | ppl 48.37\n", - "| epoch 23 | 1800/ 3181 batches | lr 1.62 | ms/batch 61.34 | loss 3.89 | ppl 49.03\n", - "| epoch 23 | 2000/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.87 | ppl 48.12\n", - "| epoch 23 | 2200/ 3181 batches | lr 1.62 | ms/batch 61.36 | loss 3.86 | ppl 47.57\n", - "| epoch 23 | 2400/ 3181 batches | lr 1.62 | ms/batch 61.38 | loss 3.87 | ppl 47.73\n", - "| epoch 23 | 2600/ 3181 batches | lr 1.62 | ms/batch 61.29 | loss 3.81 | ppl 45.15\n", - "| epoch 23 | 2800/ 3181 batches | lr 1.62 | ms/batch 61.37 | loss 3.90 | ppl 49.58\n", - "| epoch 23 | 3000/ 3181 batches | lr 1.62 | ms/batch 61.38 | loss 3.80 | ppl 44.75\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 23 | time: 206.56s | valid loss 5.64 | valid ppl 281.95\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 24 | 200/ 3181 batches | lr 1.54 | ms/batch 61.67 | loss 3.92 | ppl 50.35\n", - "| epoch 24 | 400/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.85 | ppl 47.01\n", - "| epoch 24 | 600/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.84 | ppl 46.34\n", - "| epoch 24 | 800/ 3181 batches | lr 1.54 | ms/batch 61.43 | loss 3.87 | ppl 47.90\n", - "| epoch 24 | 1000/ 3181 batches | lr 1.54 | ms/batch 61.47 | loss 3.89 | ppl 48.81\n", - "| epoch 24 | 1200/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.85 | ppl 46.83\n", - "| epoch 24 | 1400/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.87 | ppl 48.14\n", - "| epoch 24 | 1600/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.85 | ppl 46.96\n", - "| epoch 24 | 1800/ 3181 batches | lr 1.54 | ms/batch 61.40 | loss 3.86 | ppl 47.49\n", - "| epoch 24 | 2000/ 3181 batches | lr 1.54 | ms/batch 61.47 | loss 3.84 | ppl 46.41\n", - "| epoch 24 | 2200/ 3181 batches | lr 1.54 | ms/batch 61.31 | loss 3.82 | ppl 45.83\n", - "| epoch 24 | 2400/ 3181 batches | lr 1.54 | ms/batch 61.35 | loss 3.83 | ppl 46.13\n", - "| epoch 24 | 2600/ 3181 batches | lr 1.54 | ms/batch 61.36 | loss 3.77 | ppl 43.56\n", - "| epoch 24 | 2800/ 3181 batches | lr 1.54 | ms/batch 61.39 | loss 3.86 | ppl 47.52\n", - "| epoch 24 | 3000/ 3181 batches | lr 1.54 | ms/batch 61.29 | loss 3.77 | ppl 43.23\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 24 | time: 206.67s | valid loss 5.67 | valid ppl 290.25\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 25 | 200/ 3181 batches | lr 1.46 | ms/batch 61.66 | loss 3.89 | ppl 48.76\n", - "| epoch 25 | 400/ 3181 batches | lr 1.46 | ms/batch 61.41 | loss 3.82 | ppl 45.61\n", - "| epoch 25 | 600/ 3181 batches | lr 1.46 | ms/batch 61.44 | loss 3.80 | ppl 44.79\n", - "| epoch 25 | 800/ 3181 batches | lr 1.46 | ms/batch 61.35 | loss 3.83 | ppl 46.26\n", - "| epoch 25 | 1000/ 3181 batches | lr 1.46 | ms/batch 61.36 | loss 3.86 | ppl 47.26\n", - "| epoch 25 | 1200/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.81 | ppl 45.19\n", - "| epoch 25 | 1400/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.84 | ppl 46.37\n", - "| epoch 25 | 1600/ 3181 batches | lr 1.46 | ms/batch 61.36 | loss 3.82 | ppl 45.47\n", - "| epoch 25 | 1800/ 3181 batches | lr 1.46 | ms/batch 61.38 | loss 3.83 | ppl 45.88\n", - "| epoch 25 | 2000/ 3181 batches | lr 1.46 | ms/batch 61.35 | loss 3.81 | ppl 45.08\n", - "| epoch 25 | 2200/ 3181 batches | lr 1.46 | ms/batch 61.43 | loss 3.80 | ppl 44.56\n", - "| epoch 25 | 2400/ 3181 batches | lr 1.46 | ms/batch 61.37 | loss 3.80 | ppl 44.78\n", - "| epoch 25 | 2600/ 3181 batches | lr 1.46 | ms/batch 61.34 | loss 3.74 | ppl 42.12\n", - "| epoch 25 | 2800/ 3181 batches | lr 1.46 | ms/batch 61.31 | loss 3.83 | ppl 45.90\n", - "| epoch 25 | 3000/ 3181 batches | lr 1.46 | ms/batch 61.37 | loss 3.74 | ppl 42.12\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 25 | time: 206.63s | valid loss 5.65 | valid ppl 283.82\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 26 | 200/ 3181 batches | lr 1.39 | ms/batch 61.69 | loss 3.86 | ppl 47.46\n", - "| epoch 26 | 400/ 3181 batches | lr 1.39 | ms/batch 61.44 | loss 3.79 | ppl 44.15\n", - "| epoch 26 | 600/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.77 | ppl 43.51\n", - "| epoch 26 | 800/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.81 | ppl 45.08\n", - "| epoch 26 | 1000/ 3181 batches | lr 1.39 | ms/batch 61.42 | loss 3.82 | ppl 45.75\n", - "| epoch 26 | 1200/ 3181 batches | lr 1.39 | ms/batch 61.40 | loss 3.78 | ppl 43.98\n", - "| epoch 26 | 1400/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.81 | ppl 45.28\n", - "| epoch 26 | 1600/ 3181 batches | lr 1.39 | ms/batch 61.28 | loss 3.78 | ppl 43.92\n", - "| epoch 26 | 1800/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.80 | ppl 44.57\n", - "| epoch 26 | 2000/ 3181 batches | lr 1.39 | ms/batch 61.38 | loss 3.77 | ppl 43.55\n", - "| epoch 26 | 2200/ 3181 batches | lr 1.39 | ms/batch 61.44 | loss 3.77 | ppl 43.27\n", - "| epoch 26 | 2400/ 3181 batches | lr 1.39 | ms/batch 61.32 | loss 3.77 | ppl 43.43\n", - "| epoch 26 | 2600/ 3181 batches | lr 1.39 | ms/batch 61.41 | loss 3.71 | ppl 40.92\n", - "| epoch 26 | 2800/ 3181 batches | lr 1.39 | ms/batch 61.39 | loss 3.80 | ppl 44.73\n", - "| epoch 26 | 3000/ 3181 batches | lr 1.39 | ms/batch 61.40 | loss 3.71 | ppl 40.74\n" + "| epoch 6 | 3800/13484 batches | lr 3.87 | ms/batch 115.20 | loss 5.00 | ppl 148.07\n", + "| epoch 6 | 4000/13484 batches | lr 3.87 | ms/batch 115.32 | loss 4.94 | ppl 140.04\n", + "| epoch 6 | 4200/13484 batches | lr 3.87 | ms/batch 115.21 | loss 4.97 | ppl 144.64\n", + "| epoch 6 | 4400/13484 batches | lr 3.87 | ms/batch 115.20 | loss 4.99 | ppl 146.48\n", + "| epoch 6 | 4600/13484 batches | lr 3.87 | ms/batch 115.18 | loss 5.03 | ppl 153.49\n", + "| epoch 6 | 4800/13484 batches | lr 3.87 | ms/batch 115.30 | loss 5.01 | ppl 150.20\n", + "| epoch 6 | 5000/13484 batches | lr 3.87 | ms/batch 115.24 | loss 5.00 | ppl 148.23\n", + "| epoch 6 | 5200/13484 batches | lr 3.87 | ms/batch 115.22 | loss 5.00 | ppl 148.51\n", + "| epoch 6 | 5400/13484 batches | lr 3.87 | ms/batch 115.31 | loss 4.98 | ppl 145.45\n", + "| epoch 6 | 5600/13484 batches | lr 3.87 | ms/batch 115.25 | loss 4.99 | ppl 146.84\n", + "| epoch 6 | 5800/13484 batches | lr 3.87 | ms/batch 115.29 | loss 4.99 | ppl 147.24\n", + "| epoch 6 | 6000/13484 batches | lr 3.87 | ms/batch 115.19 | loss 5.01 | ppl 150.09\n", + "| epoch 6 | 6200/13484 batches | lr 3.87 | ms/batch 115.21 | loss 5.03 | ppl 152.86\n", + "| epoch 6 | 6400/13484 batches | lr 3.87 | ms/batch 115.17 | loss 5.02 | ppl 150.83\n", + "| epoch 6 | 6600/13484 batches | lr 3.87 | ms/batch 115.23 | loss 5.05 | ppl 155.25\n", + "| epoch 6 | 6800/13484 batches | lr 3.87 | ms/batch 115.23 | loss 5.00 | ppl 148.10\n", + "| epoch 6 | 7000/13484 batches | lr 3.87 | ms/batch 115.35 | loss 5.03 | ppl 152.52\n", + "| epoch 6 | 7200/13484 batches | lr 3.87 | ms/batch 115.25 | loss 5.00 | ppl 148.62\n", + "| epoch 6 | 7400/13484 batches | lr 3.87 | ms/batch 115.30 | loss 5.00 | ppl 148.56\n", + "| epoch 6 | 7600/13484 batches | lr 3.87 | ms/batch 115.25 | loss 4.99 | ppl 147.28\n", + "| epoch 6 | 7800/13484 batches | lr 3.87 | ms/batch 115.24 | loss 5.00 | ppl 147.93\n", + "| epoch 6 | 8000/13484 batches | lr 3.87 | ms/batch 115.24 | loss 4.98 | ppl 145.76\n", + "| epoch 6 | 8200/13484 batches | lr 3.87 | ms/batch 115.31 | loss 4.97 | ppl 143.39\n", + "| epoch 6 | 8400/13484 batches | lr 3.87 | ms/batch 115.24 | loss 4.99 | ppl 147.14\n", + "| epoch 6 | 8600/13484 batches | lr 3.87 | ms/batch 115.27 | loss 5.00 | ppl 148.00\n", + "| epoch 6 | 8800/13484 batches | lr 3.87 | ms/batch 115.35 | loss 4.98 | ppl 145.27\n", + "| epoch 6 | 9000/13484 batches | lr 3.87 | ms/batch 115.27 | loss 5.01 | ppl 150.06\n", + "| epoch 6 | 9200/13484 batches | lr 3.87 | ms/batch 115.21 | loss 5.01 | ppl 150.09\n", + "| epoch 6 | 9400/13484 batches | lr 3.87 | ms/batch 115.28 | loss 5.01 | ppl 150.08\n", + "| epoch 6 | 9600/13484 batches | lr 3.87 | ms/batch 115.16 | loss 4.99 | ppl 147.55\n", + "| epoch 6 | 9800/13484 batches | lr 3.87 | ms/batch 115.27 | loss 4.97 | ppl 143.67\n", + "| epoch 6 | 10000/13484 batches | lr 3.87 | ms/batch 115.20 | loss 4.99 | ppl 147.66\n", + "| epoch 6 | 10200/13484 batches | lr 3.87 | ms/batch 115.31 | loss 4.95 | ppl 141.61\n", + "| epoch 6 | 10400/13484 batches | lr 3.87 | ms/batch 115.20 | loss 4.93 | ppl 138.76\n", + "| epoch 6 | 10600/13484 batches | lr 3.87 | ms/batch 115.28 | loss 4.97 | ppl 144.59\n", + "| epoch 6 | 10800/13484 batches | lr 3.87 | ms/batch 115.23 | loss 5.01 | ppl 149.16\n", + "| epoch 6 | 11000/13484 batches | lr 3.87 | ms/batch 115.29 | loss 5.00 | ppl 148.35\n", + "| epoch 6 | 11200/13484 batches | lr 3.87 | ms/batch 115.29 | loss 5.01 | ppl 149.31\n", + "| epoch 6 | 11400/13484 batches | lr 3.87 | ms/batch 115.29 | loss 4.95 | ppl 141.26\n", + "| epoch 6 | 11600/13484 batches | lr 3.87 | ms/batch 115.34 | loss 4.98 | ppl 145.07\n", + "| epoch 6 | 11800/13484 batches | lr 3.87 | ms/batch 115.28 | loss 4.94 | ppl 140.00\n", + "| epoch 6 | 12000/13484 batches | lr 3.87 | ms/batch 115.19 | loss 5.00 | ppl 147.85\n", + "| epoch 6 | 12200/13484 batches | lr 3.87 | ms/batch 115.31 | loss 4.93 | ppl 137.74\n", + "| epoch 6 | 12400/13484 batches | lr 3.87 | ms/batch 115.26 | loss 4.95 | ppl 140.89\n", + "| epoch 6 | 12600/13484 batches | lr 3.87 | ms/batch 115.38 | loss 4.97 | ppl 143.33\n", + "| epoch 6 | 12800/13484 batches | lr 3.87 | ms/batch 115.29 | loss 4.98 | ppl 145.29\n", + "| epoch 6 | 13000/13484 batches | lr 3.87 | ms/batch 115.37 | loss 4.97 | ppl 144.45\n", + "| epoch 6 | 13200/13484 batches | lr 3.87 | ms/batch 115.31 | loss 4.98 | ppl 146.13\n", + "| epoch 6 | 13400/13484 batches | lr 3.87 | ms/batch 115.33 | loss 5.00 | ppl 148.36\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 6 | time: 1626.41s | valid loss 5.09 | valid ppl 162.11\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 7 | 200/13484 batches | lr 3.68 | ms/batch 115.82 | loss 5.02 | ppl 151.17\n", + "| epoch 7 | 400/13484 batches | lr 3.68 | ms/batch 115.20 | loss 4.98 | ppl 144.87\n", + "| epoch 7 | 600/13484 batches | lr 3.68 | ms/batch 115.23 | loss 4.96 | ppl 142.39\n", + "| epoch 7 | 800/13484 batches | lr 3.68 | ms/batch 115.16 | loss 4.96 | ppl 142.45\n", + "| epoch 7 | 1000/13484 batches | lr 3.68 | ms/batch 115.08 | loss 4.96 | ppl 142.21\n", + "| epoch 7 | 1200/13484 batches | lr 3.68 | ms/batch 115.19 | loss 4.96 | ppl 142.21\n", + "| epoch 7 | 1400/13484 batches | lr 3.68 | ms/batch 115.23 | loss 4.94 | ppl 139.97\n", + "| epoch 7 | 1600/13484 batches | lr 3.68 | ms/batch 115.13 | loss 4.99 | ppl 146.87\n", + "| epoch 7 | 1800/13484 batches | lr 3.68 | ms/batch 115.11 | loss 4.97 | ppl 144.27\n", + "| epoch 7 | 2000/13484 batches | lr 3.68 | ms/batch 115.14 | loss 4.94 | ppl 139.63\n", + "| epoch 7 | 2200/13484 batches | lr 3.68 | ms/batch 115.13 | loss 4.94 | ppl 140.28\n", + "| epoch 7 | 2400/13484 batches | lr 3.68 | ms/batch 115.14 | loss 4.94 | ppl 140.42\n", + "| epoch 7 | 2600/13484 batches | lr 3.68 | ms/batch 115.20 | loss 4.93 | ppl 138.37\n", + "| epoch 7 | 2800/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.97 | ppl 144.51\n", + "| epoch 7 | 3000/13484 batches | lr 3.68 | ms/batch 115.22 | loss 4.95 | ppl 141.43\n", + "| epoch 7 | 3200/13484 batches | lr 3.68 | ms/batch 115.17 | loss 4.92 | ppl 137.29\n", + "| epoch 7 | 3400/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.98 | ppl 145.62\n", + "| epoch 7 | 3600/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.95 | ppl 141.60\n", + "| epoch 7 | 3800/13484 batches | lr 3.68 | ms/batch 115.21 | loss 4.94 | ppl 139.88\n", + "| epoch 7 | 4000/13484 batches | lr 3.68 | ms/batch 115.19 | loss 4.89 | ppl 133.49\n", + "| epoch 7 | 4200/13484 batches | lr 3.68 | ms/batch 115.17 | loss 4.93 | ppl 138.21\n", + "| epoch 7 | 4400/13484 batches | lr 3.68 | ms/batch 115.23 | loss 4.94 | ppl 139.14\n", + "| epoch 7 | 4600/13484 batches | lr 3.68 | ms/batch 115.28 | loss 4.98 | ppl 145.67\n", + "| epoch 7 | 4800/13484 batches | lr 3.68 | ms/batch 115.25 | loss 4.96 | ppl 143.05\n", + "| epoch 7 | 5000/13484 batches | lr 3.68 | ms/batch 115.20 | loss 4.95 | ppl 141.27\n", + "| epoch 7 | 5200/13484 batches | lr 3.68 | ms/batch 115.18 | loss 4.95 | ppl 140.78\n", + "| epoch 7 | 5400/13484 batches | lr 3.68 | ms/batch 115.35 | loss 4.93 | ppl 137.98\n", + "| epoch 7 | 5600/13484 batches | lr 3.68 | ms/batch 115.29 | loss 4.94 | ppl 139.66\n", + "| epoch 7 | 5800/13484 batches | lr 3.68 | ms/batch 115.21 | loss 4.94 | ppl 139.99\n", + "| epoch 7 | 6000/13484 batches | lr 3.68 | ms/batch 115.26 | loss 4.96 | ppl 142.34\n", + "| epoch 7 | 6200/13484 batches | lr 3.68 | ms/batch 115.23 | loss 4.99 | ppl 146.32\n", + "| epoch 7 | 6400/13484 batches | lr 3.68 | ms/batch 115.18 | loss 4.96 | ppl 142.33\n", + "| epoch 7 | 6600/13484 batches | lr 3.68 | ms/batch 115.22 | loss 4.99 | ppl 146.69\n", + "| epoch 7 | 6800/13484 batches | lr 3.68 | ms/batch 115.21 | loss 4.93 | ppl 137.90\n", + "| epoch 7 | 7000/13484 batches | lr 3.68 | ms/batch 115.18 | loss 4.98 | ppl 145.72\n", + "| epoch 7 | 7200/13484 batches | lr 3.68 | ms/batch 115.25 | loss 4.94 | ppl 140.06\n", + "| epoch 7 | 7400/13484 batches | lr 3.68 | ms/batch 115.14 | loss 4.94 | ppl 140.43\n", + "| epoch 7 | 7600/13484 batches | lr 3.68 | ms/batch 115.28 | loss 4.95 | ppl 140.71\n", + "| epoch 7 | 7800/13484 batches | lr 3.68 | ms/batch 115.26 | loss 4.94 | ppl 140.23\n", + "| epoch 7 | 8000/13484 batches | lr 3.68 | ms/batch 115.21 | loss 4.93 | ppl 138.76\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 26 | time: 206.67s | valid loss 5.69 | valid ppl 294.72\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 27 | 200/ 3181 batches | lr 1.32 | ms/batch 61.61 | loss 3.83 | ppl 46.08\n", - "| epoch 27 | 400/ 3181 batches | lr 1.32 | ms/batch 61.34 | loss 3.76 | ppl 42.90\n", - "| epoch 27 | 600/ 3181 batches | lr 1.32 | ms/batch 61.37 | loss 3.73 | ppl 41.77\n", - "| epoch 27 | 800/ 3181 batches | lr 1.32 | ms/batch 61.39 | loss 3.78 | ppl 43.61\n", - "| epoch 27 | 1000/ 3181 batches | lr 1.32 | ms/batch 61.38 | loss 3.80 | ppl 44.57\n", - "| epoch 27 | 1200/ 3181 batches | lr 1.32 | ms/batch 61.31 | loss 3.75 | ppl 42.51\n", - "| epoch 27 | 1400/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.79 | ppl 44.18\n", - "| epoch 27 | 1600/ 3181 batches | lr 1.32 | ms/batch 61.30 | loss 3.76 | ppl 42.82\n", - "| epoch 27 | 1800/ 3181 batches | lr 1.32 | ms/batch 61.41 | loss 3.76 | ppl 42.95\n", - "| epoch 27 | 2000/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.75 | ppl 42.42\n", - "| epoch 27 | 2200/ 3181 batches | lr 1.32 | ms/batch 61.35 | loss 3.74 | ppl 42.12\n", - "| epoch 27 | 2400/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.74 | ppl 42.31\n", - "| epoch 27 | 2600/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.68 | ppl 39.83\n", - "| epoch 27 | 2800/ 3181 batches | lr 1.32 | ms/batch 61.36 | loss 3.77 | ppl 43.28\n", - "| epoch 27 | 3000/ 3181 batches | lr 1.32 | ms/batch 61.32 | loss 3.68 | ppl 39.55\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 27 | time: 206.56s | valid loss 5.75 | valid ppl 315.59\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 28 | 200/ 3181 batches | lr 1.25 | ms/batch 61.70 | loss 3.80 | ppl 44.70\n", - "| epoch 28 | 400/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.73 | ppl 41.81\n", - "| epoch 28 | 600/ 3181 batches | lr 1.25 | ms/batch 61.43 | loss 3.71 | ppl 40.76\n", - "| epoch 28 | 800/ 3181 batches | lr 1.25 | ms/batch 61.34 | loss 3.75 | ppl 42.56\n", - "| epoch 28 | 1000/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.77 | ppl 43.35\n", - "| epoch 28 | 1200/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.72 | ppl 41.32\n", - "| epoch 28 | 1400/ 3181 batches | lr 1.25 | ms/batch 61.40 | loss 3.75 | ppl 42.65\n", - "| epoch 28 | 1600/ 3181 batches | lr 1.25 | ms/batch 61.34 | loss 3.73 | ppl 41.67\n", - "| epoch 28 | 1800/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.73 | ppl 41.85\n", - "| epoch 28 | 2000/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.72 | ppl 41.24\n", - "| epoch 28 | 2200/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.71 | ppl 40.83\n", - "| epoch 28 | 2400/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.72 | ppl 41.20\n", - "| epoch 28 | 2600/ 3181 batches | lr 1.25 | ms/batch 61.39 | loss 3.66 | ppl 38.72\n", - "| epoch 28 | 2800/ 3181 batches | lr 1.25 | ms/batch 61.35 | loss 3.74 | ppl 42.13\n", - "| epoch 28 | 3000/ 3181 batches | lr 1.25 | ms/batch 61.41 | loss 3.65 | ppl 38.37\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 28 | time: 206.71s | valid loss 5.77 | valid ppl 320.59\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 29 | 200/ 3181 batches | lr 1.19 | ms/batch 61.73 | loss 3.77 | ppl 43.52\n", - "| epoch 29 | 400/ 3181 batches | lr 1.19 | ms/batch 61.39 | loss 3.70 | ppl 40.52\n", - "| epoch 29 | 600/ 3181 batches | lr 1.19 | ms/batch 61.39 | loss 3.68 | ppl 39.58\n", - "| epoch 29 | 800/ 3181 batches | lr 1.19 | ms/batch 61.48 | loss 3.72 | ppl 41.41\n", - "| epoch 29 | 1000/ 3181 batches | lr 1.19 | ms/batch 61.35 | loss 3.74 | ppl 42.29\n", - "| epoch 29 | 1200/ 3181 batches | lr 1.19 | ms/batch 61.36 | loss 3.70 | ppl 40.36\n", - "| epoch 29 | 1400/ 3181 batches | lr 1.19 | ms/batch 61.37 | loss 3.73 | ppl 41.64\n", - "| epoch 29 | 1600/ 3181 batches | lr 1.19 | ms/batch 61.40 | loss 3.71 | ppl 40.66\n", - "| epoch 29 | 1800/ 3181 batches | lr 1.19 | ms/batch 61.44 | loss 3.72 | ppl 41.08\n", - "| epoch 29 | 2000/ 3181 batches | lr 1.19 | ms/batch 61.44 | loss 3.69 | ppl 40.20\n", - "| epoch 29 | 2200/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.68 | ppl 39.80\n", - "| epoch 29 | 2400/ 3181 batches | lr 1.19 | ms/batch 61.45 | loss 3.70 | ppl 40.25\n", - "| epoch 29 | 2600/ 3181 batches | lr 1.19 | ms/batch 61.47 | loss 3.63 | ppl 37.79\n", - "| epoch 29 | 2800/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.72 | ppl 41.21\n", - "| epoch 29 | 3000/ 3181 batches | lr 1.19 | ms/batch 61.42 | loss 3.62 | ppl 37.43\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 29 | time: 206.79s | valid loss 5.81 | valid ppl 332.16\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 30 | 200/ 3181 batches | lr 1.13 | ms/batch 61.74 | loss 3.74 | ppl 42.22\n", - "| epoch 30 | 400/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.68 | ppl 39.52\n", - "| epoch 30 | 600/ 3181 batches | lr 1.13 | ms/batch 61.41 | loss 3.65 | ppl 38.62\n", - "| epoch 30 | 800/ 3181 batches | lr 1.13 | ms/batch 61.39 | loss 3.70 | ppl 40.47\n", - "| epoch 30 | 1000/ 3181 batches | lr 1.13 | ms/batch 61.50 | loss 3.72 | ppl 41.14\n", - "| epoch 30 | 1200/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.67 | ppl 39.41\n", - "| epoch 30 | 1400/ 3181 batches | lr 1.13 | ms/batch 61.43 | loss 3.71 | ppl 40.66\n", - "| epoch 30 | 1600/ 3181 batches | lr 1.13 | ms/batch 61.40 | loss 3.68 | ppl 39.62\n", - "| epoch 30 | 1800/ 3181 batches | lr 1.13 | ms/batch 61.38 | loss 3.69 | ppl 39.97\n", - "| epoch 30 | 2000/ 3181 batches | lr 1.13 | ms/batch 61.36 | loss 3.67 | ppl 39.34\n", - "| epoch 30 | 2200/ 3181 batches | lr 1.13 | ms/batch 61.43 | loss 3.66 | ppl 38.99\n", - "| epoch 30 | 2400/ 3181 batches | lr 1.13 | ms/batch 61.42 | loss 3.66 | ppl 39.01\n", - "| epoch 30 | 2600/ 3181 batches | lr 1.13 | ms/batch 61.40 | loss 3.61 | ppl 36.84\n", - "| epoch 30 | 2800/ 3181 batches | lr 1.13 | ms/batch 61.50 | loss 3.69 | ppl 40.20\n", - "| epoch 30 | 3000/ 3181 batches | lr 1.13 | ms/batch 61.38 | loss 3.60 | ppl 36.54\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 30 | time: 206.80s | valid loss 5.75 | valid ppl 313.98\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 31 | 200/ 3181 batches | lr 1.07 | ms/batch 61.74 | loss 3.72 | ppl 41.43\n", - "| epoch 31 | 400/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.65 | ppl 38.65\n", - "| epoch 31 | 600/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.63 | ppl 37.82\n", - "| epoch 31 | 800/ 3181 batches | lr 1.07 | ms/batch 61.40 | loss 3.68 | ppl 39.51\n", - "| epoch 31 | 1000/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.69 | ppl 40.17\n", - "| epoch 31 | 1200/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.65 | ppl 38.53\n", - "| epoch 31 | 1400/ 3181 batches | lr 1.07 | ms/batch 61.36 | loss 3.69 | ppl 39.93\n", - "| epoch 31 | 1600/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.66 | ppl 38.77\n", - "| epoch 31 | 1800/ 3181 batches | lr 1.07 | ms/batch 61.39 | loss 3.67 | ppl 39.17\n", - "| epoch 31 | 2000/ 3181 batches | lr 1.07 | ms/batch 61.49 | loss 3.65 | ppl 38.48\n", - "| epoch 31 | 2200/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.63 | ppl 37.78\n", - "| epoch 31 | 2400/ 3181 batches | lr 1.07 | ms/batch 61.34 | loss 3.65 | ppl 38.35\n", - "| epoch 31 | 2600/ 3181 batches | lr 1.07 | ms/batch 61.41 | loss 3.59 | ppl 36.09\n", - "| epoch 31 | 2800/ 3181 batches | lr 1.07 | ms/batch 61.37 | loss 3.67 | ppl 39.29\n", - "| epoch 31 | 3000/ 3181 batches | lr 1.07 | ms/batch 61.36 | loss 3.57 | ppl 35.60\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 31 | time: 206.68s | valid loss 5.82 | valid ppl 335.54\n", - "-----------------------------------------------------------------------------------------\n" + "| epoch 7 | 8200/13484 batches | lr 3.68 | ms/batch 115.26 | loss 4.92 | ppl 136.77\n", + "| epoch 7 | 8400/13484 batches | lr 3.68 | ms/batch 115.33 | loss 4.94 | ppl 139.78\n", + "| epoch 7 | 8600/13484 batches | lr 3.68 | ms/batch 115.31 | loss 4.95 | ppl 141.01\n", + "| epoch 7 | 8800/13484 batches | lr 3.68 | ms/batch 115.18 | loss 4.93 | ppl 138.80\n", + "| epoch 7 | 9000/13484 batches | lr 3.68 | ms/batch 115.19 | loss 4.95 | ppl 141.73\n", + "| epoch 7 | 9200/13484 batches | lr 3.68 | ms/batch 115.23 | loss 4.97 | ppl 144.05\n", + "| epoch 7 | 9400/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.97 | ppl 144.66\n", + "| epoch 7 | 9600/13484 batches | lr 3.68 | ms/batch 115.26 | loss 4.93 | ppl 138.12\n", + "| epoch 7 | 9800/13484 batches | lr 3.68 | ms/batch 115.27 | loss 4.91 | ppl 135.39\n", + "| epoch 7 | 10000/13484 batches | lr 3.68 | ms/batch 115.19 | loss 4.94 | ppl 140.12\n", + "| epoch 7 | 10200/13484 batches | lr 3.68 | ms/batch 115.27 | loss 4.90 | ppl 134.73\n", + "| epoch 7 | 10400/13484 batches | lr 3.68 | ms/batch 115.29 | loss 4.87 | ppl 130.45\n", + "| epoch 7 | 10600/13484 batches | lr 3.68 | ms/batch 115.35 | loss 4.92 | ppl 137.36\n", + "| epoch 7 | 10800/13484 batches | lr 3.68 | ms/batch 115.29 | loss 4.94 | ppl 140.35\n", + "| epoch 7 | 11000/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.95 | ppl 141.22\n", + "| epoch 7 | 11200/13484 batches | lr 3.68 | ms/batch 115.28 | loss 4.94 | ppl 139.33\n", + "| epoch 7 | 11400/13484 batches | lr 3.68 | ms/batch 115.28 | loss 4.89 | ppl 133.07\n", + "| epoch 7 | 11600/13484 batches | lr 3.68 | ms/batch 115.21 | loss 4.93 | ppl 137.82\n", + "| epoch 7 | 11800/13484 batches | lr 3.68 | ms/batch 115.33 | loss 4.89 | ppl 132.51\n", + "| epoch 7 | 12000/13484 batches | lr 3.68 | ms/batch 115.32 | loss 4.94 | ppl 139.89\n", + "| epoch 7 | 12200/13484 batches | lr 3.68 | ms/batch 115.25 | loss 4.88 | ppl 131.43\n", + "| epoch 7 | 12400/13484 batches | lr 3.68 | ms/batch 115.32 | loss 4.89 | ppl 133.23\n", + "| epoch 7 | 12600/13484 batches | lr 3.68 | ms/batch 115.30 | loss 4.92 | ppl 136.69\n", + "| epoch 7 | 12800/13484 batches | lr 3.68 | ms/batch 115.27 | loss 4.94 | ppl 139.23\n", + "| epoch 7 | 13000/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.92 | ppl 136.46\n", + "| epoch 7 | 13200/13484 batches | lr 3.68 | ms/batch 115.31 | loss 4.92 | ppl 137.53\n", + "| epoch 7 | 13400/13484 batches | lr 3.68 | ms/batch 115.24 | loss 4.94 | ppl 140.23\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 7 | time: 1626.06s | valid loss 5.05 | valid ppl 155.94\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 8 | 200/13484 batches | lr 3.49 | ms/batch 115.85 | loss 4.97 | ppl 143.91\n", + "| epoch 8 | 400/13484 batches | lr 3.49 | ms/batch 115.20 | loss 4.93 | ppl 138.69\n", + "| epoch 8 | 600/13484 batches | lr 3.49 | ms/batch 115.25 | loss 4.92 | ppl 137.31\n", + "| epoch 8 | 800/13484 batches | lr 3.49 | ms/batch 115.21 | loss 4.91 | ppl 135.14\n", + "| epoch 8 | 1000/13484 batches | lr 3.49 | ms/batch 115.09 | loss 4.91 | ppl 136.03\n", + "| epoch 8 | 1200/13484 batches | lr 3.49 | ms/batch 115.24 | loss 4.91 | ppl 135.25\n", + "| epoch 8 | 1400/13484 batches | lr 3.49 | ms/batch 115.16 | loss 4.89 | ppl 132.45\n", + "| epoch 8 | 1600/13484 batches | lr 3.49 | ms/batch 115.19 | loss 4.94 | ppl 139.52\n", + "| epoch 8 | 1800/13484 batches | lr 3.49 | ms/batch 115.13 | loss 4.92 | ppl 136.90\n", + "| epoch 8 | 2000/13484 batches | lr 3.49 | ms/batch 115.23 | loss 4.89 | ppl 132.80\n", + "| epoch 8 | 2200/13484 batches | lr 3.49 | ms/batch 115.12 | loss 4.89 | ppl 132.74\n", + "| epoch 8 | 2400/13484 batches | lr 3.49 | ms/batch 115.25 | loss 4.90 | ppl 133.92\n", + "| epoch 8 | 2600/13484 batches | lr 3.49 | ms/batch 115.27 | loss 4.88 | ppl 131.38\n", + "| epoch 8 | 2800/13484 batches | lr 3.49 | ms/batch 115.29 | loss 4.92 | ppl 137.14\n", + "| epoch 8 | 3000/13484 batches | lr 3.49 | ms/batch 115.18 | loss 4.90 | ppl 134.47\n", + "| epoch 8 | 3200/13484 batches | lr 3.49 | ms/batch 115.27 | loss 4.87 | ppl 130.24\n", + "| epoch 8 | 3400/13484 batches | lr 3.49 | ms/batch 115.24 | loss 4.93 | ppl 139.00\n", + "| epoch 8 | 3600/13484 batches | lr 3.49 | ms/batch 115.20 | loss 4.91 | ppl 135.20\n", + "| epoch 8 | 3800/13484 batches | lr 3.49 | ms/batch 115.24 | loss 4.90 | ppl 133.96\n", + "| epoch 8 | 4000/13484 batches | lr 3.49 | ms/batch 115.19 | loss 4.84 | ppl 127.05\n", + "| epoch 8 | 4200/13484 batches | lr 3.49 | ms/batch 115.30 | loss 4.87 | ppl 130.76\n", + "| epoch 8 | 4400/13484 batches | lr 3.49 | ms/batch 115.29 | loss 4.88 | ppl 132.28\n", + "| epoch 8 | 4600/13484 batches | lr 3.49 | ms/batch 115.36 | loss 4.93 | ppl 138.46\n", + "| epoch 8 | 4800/13484 batches | lr 3.49 | ms/batch 115.18 | loss 4.91 | ppl 135.37\n", + "| epoch 8 | 5000/13484 batches | lr 3.49 | ms/batch 115.23 | loss 4.90 | ppl 134.12\n", + "| epoch 8 | 5200/13484 batches | lr 3.49 | ms/batch 115.21 | loss 4.90 | ppl 134.65\n", + "| epoch 8 | 5400/13484 batches | lr 3.49 | ms/batch 115.35 | loss 4.87 | ppl 130.93\n", + "| epoch 8 | 5600/13484 batches | lr 3.49 | ms/batch 115.35 | loss 4.89 | ppl 133.28\n", + "| epoch 8 | 5800/13484 batches | lr 3.49 | ms/batch 115.23 | loss 4.89 | ppl 132.54\n", + "| epoch 8 | 6000/13484 batches | lr 3.49 | ms/batch 115.22 | loss 4.91 | ppl 135.15\n", + "| epoch 8 | 6200/13484 batches | lr 3.49 | ms/batch 115.27 | loss 4.94 | ppl 139.25\n", + "| epoch 8 | 6400/13484 batches | lr 3.49 | ms/batch 115.31 | loss 4.91 | ppl 135.37\n", + "| epoch 8 | 6600/13484 batches | lr 3.49 | ms/batch 115.17 | loss 4.94 | ppl 139.28\n", + "| epoch 8 | 6800/13484 batches | lr 3.49 | ms/batch 115.27 | loss 4.88 | ppl 132.05\n", + "| epoch 8 | 7000/13484 batches | lr 3.49 | ms/batch 115.29 | loss 4.92 | ppl 137.41\n", + "| epoch 8 | 7200/13484 batches | lr 3.49 | ms/batch 115.34 | loss 4.90 | ppl 133.68\n", + "| epoch 8 | 7400/13484 batches | lr 3.49 | ms/batch 115.27 | loss 4.89 | ppl 133.58\n", + "| epoch 8 | 7600/13484 batches | lr 3.49 | ms/batch 115.26 | loss 4.90 | ppl 133.64\n", + "| epoch 8 | 7800/13484 batches | lr 3.49 | ms/batch 115.33 | loss 4.89 | ppl 133.55\n", + "| epoch 8 | 8000/13484 batches | lr 3.49 | ms/batch 115.17 | loss 4.88 | ppl 132.23\n", + "| epoch 8 | 8200/13484 batches | lr 3.49 | ms/batch 115.25 | loss 4.87 | ppl 129.93\n", + "| epoch 8 | 8400/13484 batches | lr 3.49 | ms/batch 115.31 | loss 4.89 | ppl 133.16\n", + "| epoch 8 | 8600/13484 batches | lr 3.49 | ms/batch 115.30 | loss 4.89 | ppl 133.49\n", + "| epoch 8 | 8800/13484 batches | lr 3.49 | ms/batch 115.31 | loss 4.88 | ppl 131.42\n", + "| epoch 8 | 9000/13484 batches | lr 3.49 | ms/batch 115.29 | loss 4.89 | ppl 133.59\n", + "| epoch 8 | 9200/13484 batches | lr 3.49 | ms/batch 115.28 | loss 4.91 | ppl 136.20\n", + "| epoch 8 | 9400/13484 batches | lr 3.49 | ms/batch 115.29 | loss 4.91 | ppl 135.54\n", + "| epoch 8 | 9600/13484 batches | lr 3.49 | ms/batch 115.32 | loss 4.88 | ppl 131.19\n", + "| epoch 8 | 9800/13484 batches | lr 3.49 | ms/batch 115.34 | loss 4.86 | ppl 128.72\n", + "| epoch 8 | 10000/13484 batches | lr 3.49 | ms/batch 115.32 | loss 4.89 | ppl 132.80\n", + "| epoch 8 | 10200/13484 batches | lr 3.49 | ms/batch 115.33 | loss 4.85 | ppl 128.25\n", + "| epoch 8 | 10400/13484 batches | lr 3.49 | ms/batch 115.35 | loss 4.83 | ppl 124.93\n", + "| epoch 8 | 10600/13484 batches | lr 3.49 | ms/batch 115.30 | loss 4.87 | ppl 130.59\n", + "| epoch 8 | 10800/13484 batches | lr 3.49 | ms/batch 115.24 | loss 4.90 | ppl 133.78\n", + "| epoch 8 | 11000/13484 batches | lr 3.49 | ms/batch 115.30 | loss 4.90 | ppl 133.75\n", + "| epoch 8 | 11200/13484 batches | lr 3.49 | ms/batch 115.31 | loss 4.89 | ppl 133.33\n", + "| epoch 8 | 11400/13484 batches | lr 3.49 | ms/batch 115.36 | loss 4.84 | ppl 126.25\n", + "| epoch 8 | 11600/13484 batches | lr 3.49 | ms/batch 115.38 | loss 4.88 | ppl 131.70\n", + "| epoch 8 | 11800/13484 batches | lr 3.49 | ms/batch 115.36 | loss 4.84 | ppl 127.09\n", + "| epoch 8 | 12000/13484 batches | lr 3.49 | ms/batch 115.38 | loss 4.89 | ppl 133.44\n", + "| epoch 8 | 12200/13484 batches | lr 3.49 | ms/batch 115.38 | loss 4.83 | ppl 124.78\n", + "| epoch 8 | 12400/13484 batches | lr 3.49 | ms/batch 115.38 | loss 4.84 | ppl 125.91\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 32 | 200/ 3181 batches | lr 1.02 | ms/batch 61.77 | loss 3.70 | ppl 40.52\n", - "| epoch 32 | 400/ 3181 batches | lr 1.02 | ms/batch 61.35 | loss 3.64 | ppl 37.96\n", - "| epoch 32 | 600/ 3181 batches | lr 1.02 | ms/batch 61.39 | loss 3.61 | ppl 36.88\n", - "| epoch 32 | 800/ 3181 batches | lr 1.02 | ms/batch 61.41 | loss 3.66 | ppl 38.80\n", - "| epoch 32 | 1000/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.67 | ppl 39.32\n", - "| epoch 32 | 1200/ 3181 batches | lr 1.02 | ms/batch 61.42 | loss 3.63 | ppl 37.59\n", - "| epoch 32 | 1400/ 3181 batches | lr 1.02 | ms/batch 61.46 | loss 3.66 | ppl 38.96\n", - "| epoch 32 | 1600/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.64 | ppl 38.12\n", - "| epoch 32 | 1800/ 3181 batches | lr 1.02 | ms/batch 61.46 | loss 3.64 | ppl 38.28\n", - "| epoch 32 | 2000/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.63 | ppl 37.70\n", - "| epoch 32 | 2200/ 3181 batches | lr 1.02 | ms/batch 61.37 | loss 3.61 | ppl 37.07\n", - "| epoch 32 | 2400/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.62 | ppl 37.39\n", - "| epoch 32 | 2600/ 3181 batches | lr 1.02 | ms/batch 61.38 | loss 3.56 | ppl 35.19\n", - "| epoch 32 | 2800/ 3181 batches | lr 1.02 | ms/batch 61.43 | loss 3.65 | ppl 38.29\n", - "| epoch 32 | 3000/ 3181 batches | lr 1.02 | ms/batch 61.36 | loss 3.55 | ppl 34.89\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 32 | time: 206.72s | valid loss 5.81 | valid ppl 333.52\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 33 | 200/ 3181 batches | lr 0.97 | ms/batch 61.68 | loss 3.68 | ppl 39.71\n", - "| epoch 33 | 400/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.61 | ppl 37.00\n", - "| epoch 33 | 600/ 3181 batches | lr 0.97 | ms/batch 61.36 | loss 3.59 | ppl 36.33\n", - "| epoch 33 | 800/ 3181 batches | lr 0.97 | ms/batch 61.35 | loss 3.64 | ppl 38.02\n", - "| epoch 33 | 1000/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.65 | ppl 38.54\n", - "| epoch 33 | 1200/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.61 | ppl 37.12\n", - "| epoch 33 | 1400/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.64 | ppl 38.27\n", - "| epoch 33 | 1600/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.62 | ppl 37.26\n", - "| epoch 33 | 1800/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.62 | ppl 37.45\n", - "| epoch 33 | 2000/ 3181 batches | lr 0.97 | ms/batch 61.43 | loss 3.61 | ppl 36.92\n", - "| epoch 33 | 2200/ 3181 batches | lr 0.97 | ms/batch 61.37 | loss 3.59 | ppl 36.34\n", - "| epoch 33 | 2400/ 3181 batches | lr 0.97 | ms/batch 61.41 | loss 3.60 | ppl 36.73\n", - "| epoch 33 | 2600/ 3181 batches | lr 0.97 | ms/batch 61.46 | loss 3.54 | ppl 34.54\n", - "| epoch 33 | 2800/ 3181 batches | lr 0.97 | ms/batch 61.39 | loss 3.62 | ppl 37.42\n", - "| epoch 33 | 3000/ 3181 batches | lr 0.97 | ms/batch 61.45 | loss 3.53 | ppl 34.28\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 33 | time: 206.79s | valid loss 5.84 | valid ppl 345.08\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 34 | 200/ 3181 batches | lr 0.92 | ms/batch 61.72 | loss 3.66 | ppl 38.95\n", - "| epoch 34 | 400/ 3181 batches | lr 0.92 | ms/batch 61.44 | loss 3.60 | ppl 36.53\n", - "| epoch 34 | 600/ 3181 batches | lr 0.92 | ms/batch 61.43 | loss 3.57 | ppl 35.49\n", - "| epoch 34 | 800/ 3181 batches | lr 0.92 | ms/batch 61.42 | loss 3.62 | ppl 37.33\n", - "| epoch 34 | 1000/ 3181 batches | lr 0.92 | ms/batch 61.39 | loss 3.63 | ppl 37.79\n", - "| epoch 34 | 1200/ 3181 batches | lr 0.92 | ms/batch 61.34 | loss 3.59 | ppl 36.16\n", - "| epoch 34 | 1400/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.63 | ppl 37.62\n", - "| epoch 34 | 1600/ 3181 batches | lr 0.92 | ms/batch 61.42 | loss 3.60 | ppl 36.58\n", - "| epoch 34 | 1800/ 3181 batches | lr 0.92 | ms/batch 61.37 | loss 3.60 | ppl 36.77\n", - "| epoch 34 | 2000/ 3181 batches | lr 0.92 | ms/batch 61.36 | loss 3.59 | ppl 36.25\n", - "| epoch 34 | 2200/ 3181 batches | lr 0.92 | ms/batch 61.43 | loss 3.58 | ppl 35.76\n", - "| epoch 34 | 2400/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.59 | ppl 36.19\n", - "| epoch 34 | 2600/ 3181 batches | lr 0.92 | ms/batch 61.40 | loss 3.52 | ppl 33.87\n", - "| epoch 34 | 2800/ 3181 batches | lr 0.92 | ms/batch 61.31 | loss 3.61 | ppl 36.90\n", - "| epoch 34 | 3000/ 3181 batches | lr 0.92 | ms/batch 61.41 | loss 3.52 | ppl 33.68\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 34 | time: 206.73s | valid loss 5.83 | valid ppl 341.59\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 35 | 200/ 3181 batches | lr 0.87 | ms/batch 61.62 | loss 3.64 | ppl 38.22\n", - "| epoch 35 | 400/ 3181 batches | lr 0.87 | ms/batch 61.42 | loss 3.58 | ppl 35.82\n", - "| epoch 35 | 600/ 3181 batches | lr 0.87 | ms/batch 61.36 | loss 3.55 | ppl 34.84\n", - "| epoch 35 | 800/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.61 | ppl 36.83\n", - "| epoch 35 | 1000/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.62 | ppl 37.16\n", - "| epoch 35 | 1200/ 3181 batches | lr 0.87 | ms/batch 61.44 | loss 3.57 | ppl 35.54\n", - "| epoch 35 | 1400/ 3181 batches | lr 0.87 | ms/batch 61.35 | loss 3.60 | ppl 36.70\n", - "| epoch 35 | 1600/ 3181 batches | lr 0.87 | ms/batch 61.44 | loss 3.58 | ppl 35.97\n", - "| epoch 35 | 1800/ 3181 batches | lr 0.87 | ms/batch 61.42 | loss 3.58 | ppl 35.94\n", - "| epoch 35 | 2000/ 3181 batches | lr 0.87 | ms/batch 61.48 | loss 3.57 | ppl 35.45\n", - "| epoch 35 | 2200/ 3181 batches | lr 0.87 | ms/batch 61.41 | loss 3.56 | ppl 35.07\n", - "| epoch 35 | 2400/ 3181 batches | lr 0.87 | ms/batch 61.37 | loss 3.57 | ppl 35.36\n", - "| epoch 35 | 2600/ 3181 batches | lr 0.87 | ms/batch 61.32 | loss 3.51 | ppl 33.39\n", - "| epoch 35 | 2800/ 3181 batches | lr 0.87 | ms/batch 61.40 | loss 3.59 | ppl 36.19\n", - "| epoch 35 | 3000/ 3181 batches | lr 0.87 | ms/batch 61.39 | loss 3.50 | ppl 33.10\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 35 | time: 206.71s | valid loss 5.84 | valid ppl 345.09\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 36 | 200/ 3181 batches | lr 0.83 | ms/batch 61.71 | loss 3.63 | ppl 37.53\n", - "| epoch 36 | 400/ 3181 batches | lr 0.83 | ms/batch 61.41 | loss 3.56 | ppl 35.03\n", - "| epoch 36 | 600/ 3181 batches | lr 0.83 | ms/batch 61.40 | loss 3.53 | ppl 34.18\n", - "| epoch 36 | 800/ 3181 batches | lr 0.83 | ms/batch 61.40 | loss 3.59 | ppl 36.15\n", - "| epoch 36 | 1000/ 3181 batches | lr 0.83 | ms/batch 61.36 | loss 3.60 | ppl 36.47\n", - "| epoch 36 | 1200/ 3181 batches | lr 0.83 | ms/batch 61.36 | loss 3.56 | ppl 35.02\n", - "| epoch 36 | 1400/ 3181 batches | lr 0.83 | ms/batch 61.41 | loss 3.59 | ppl 36.26\n", - "| epoch 36 | 1600/ 3181 batches | lr 0.83 | ms/batch 61.38 | loss 3.57 | ppl 35.41\n", - "| epoch 36 | 1800/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.57 | ppl 35.56\n", - "| epoch 36 | 2000/ 3181 batches | lr 0.83 | ms/batch 61.45 | loss 3.55 | ppl 34.90\n", - "| epoch 36 | 2200/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.54 | ppl 34.47\n", - "| epoch 36 | 2400/ 3181 batches | lr 0.83 | ms/batch 61.35 | loss 3.55 | ppl 34.76\n", - "| epoch 36 | 2600/ 3181 batches | lr 0.83 | ms/batch 61.43 | loss 3.49 | ppl 32.79\n", - "| epoch 36 | 2800/ 3181 batches | lr 0.83 | ms/batch 61.38 | loss 3.57 | ppl 35.60\n", - "| epoch 36 | 3000/ 3181 batches | lr 0.83 | ms/batch 61.44 | loss 3.48 | ppl 32.46\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 36 | time: 206.71s | valid loss 5.83 | valid ppl 339.42\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 37 | 200/ 3181 batches | lr 0.79 | ms/batch 61.75 | loss 3.61 | ppl 37.13\n", - "| epoch 37 | 400/ 3181 batches | lr 0.79 | ms/batch 61.40 | loss 3.54 | ppl 34.55\n", - "| epoch 37 | 600/ 3181 batches | lr 0.79 | ms/batch 61.37 | loss 3.51 | ppl 33.58\n" + "| epoch 8 | 12600/13484 batches | lr 3.49 | ms/batch 115.31 | loss 4.86 | ppl 128.83\n", + "| epoch 8 | 12800/13484 batches | lr 3.49 | ms/batch 115.24 | loss 4.88 | ppl 131.60\n", + "| epoch 8 | 13000/13484 batches | lr 3.49 | ms/batch 115.33 | loss 4.87 | ppl 130.10\n", + "| epoch 8 | 13200/13484 batches | lr 3.49 | ms/batch 115.29 | loss 4.88 | ppl 131.87\n", + "| epoch 8 | 13400/13484 batches | lr 3.49 | ms/batch 115.40 | loss 4.90 | ppl 134.29\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 8 | time: 1626.66s | valid loss 5.00 | valid ppl 148.39\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 9 | 200/13484 batches | lr 3.32 | ms/batch 115.97 | loss 4.92 | ppl 136.72\n", + "| epoch 9 | 400/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.88 | ppl 131.62\n", + "| epoch 9 | 600/13484 batches | lr 3.32 | ms/batch 115.30 | loss 4.85 | ppl 128.22\n", + "| epoch 9 | 800/13484 batches | lr 3.32 | ms/batch 115.29 | loss 4.86 | ppl 128.84\n", + "| epoch 9 | 1000/13484 batches | lr 3.32 | ms/batch 115.29 | loss 4.86 | ppl 129.65\n", + "| epoch 9 | 1200/13484 batches | lr 3.32 | ms/batch 115.21 | loss 4.86 | ppl 128.93\n", + "| epoch 9 | 1400/13484 batches | lr 3.32 | ms/batch 115.28 | loss 4.85 | ppl 127.80\n", + "| epoch 9 | 1600/13484 batches | lr 3.32 | ms/batch 115.36 | loss 4.89 | ppl 132.74\n", + "| epoch 9 | 1800/13484 batches | lr 3.32 | ms/batch 115.27 | loss 4.88 | ppl 131.14\n", + "| epoch 9 | 2000/13484 batches | lr 3.32 | ms/batch 115.32 | loss 4.84 | ppl 126.60\n", + "| epoch 9 | 2200/13484 batches | lr 3.32 | ms/batch 115.33 | loss 4.84 | ppl 126.74\n", + "| epoch 9 | 2400/13484 batches | lr 3.32 | ms/batch 115.32 | loss 4.84 | ppl 127.02\n", + "| epoch 9 | 2600/13484 batches | lr 3.32 | ms/batch 115.31 | loss 4.84 | ppl 126.21\n", + "| epoch 9 | 2800/13484 batches | lr 3.32 | ms/batch 115.34 | loss 4.87 | ppl 130.53\n", + "| epoch 9 | 3000/13484 batches | lr 3.32 | ms/batch 115.31 | loss 4.85 | ppl 127.68\n", + "| epoch 9 | 3200/13484 batches | lr 3.32 | ms/batch 115.30 | loss 4.83 | ppl 125.33\n", + "| epoch 9 | 3400/13484 batches | lr 3.32 | ms/batch 115.26 | loss 4.89 | ppl 133.40\n", + "| epoch 9 | 3600/13484 batches | lr 3.32 | ms/batch 115.38 | loss 4.86 | ppl 129.20\n", + "| epoch 9 | 3800/13484 batches | lr 3.32 | ms/batch 115.39 | loss 4.85 | ppl 127.67\n", + "| epoch 9 | 4000/13484 batches | lr 3.32 | ms/batch 115.39 | loss 4.80 | ppl 121.75\n", + "| epoch 9 | 4200/13484 batches | lr 3.32 | ms/batch 115.30 | loss 4.83 | ppl 125.31\n", + "| epoch 9 | 4400/13484 batches | lr 3.32 | ms/batch 115.44 | loss 4.84 | ppl 126.39\n", + "| epoch 9 | 4600/13484 batches | lr 3.32 | ms/batch 115.29 | loss 4.88 | ppl 131.31\n", + "| epoch 9 | 4800/13484 batches | lr 3.32 | ms/batch 115.34 | loss 4.86 | ppl 129.65\n", + "| epoch 9 | 5000/13484 batches | lr 3.32 | ms/batch 115.37 | loss 4.85 | ppl 128.14\n", + "| epoch 9 | 5200/13484 batches | lr 3.32 | ms/batch 115.37 | loss 4.85 | ppl 128.35\n", + "| epoch 9 | 5400/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.83 | ppl 124.69\n", + "| epoch 9 | 5600/13484 batches | lr 3.32 | ms/batch 115.47 | loss 4.85 | ppl 127.26\n", + "| epoch 9 | 5800/13484 batches | lr 3.32 | ms/batch 115.34 | loss 4.85 | ppl 127.16\n", + "| epoch 9 | 6000/13484 batches | lr 3.32 | ms/batch 115.45 | loss 4.86 | ppl 128.86\n", + "| epoch 9 | 6200/13484 batches | lr 3.32 | ms/batch 115.41 | loss 4.88 | ppl 131.85\n", + "| epoch 9 | 6400/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.86 | ppl 129.32\n", + "| epoch 9 | 6600/13484 batches | lr 3.32 | ms/batch 115.34 | loss 4.89 | ppl 132.53\n", + "| epoch 9 | 6800/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.83 | ppl 125.33\n", + "| epoch 9 | 7000/13484 batches | lr 3.32 | ms/batch 115.38 | loss 4.88 | ppl 131.81\n", + "| epoch 9 | 7200/13484 batches | lr 3.32 | ms/batch 115.35 | loss 4.85 | ppl 127.28\n", + "| epoch 9 | 7400/13484 batches | lr 3.32 | ms/batch 115.42 | loss 4.85 | ppl 127.94\n", + "| epoch 9 | 7600/13484 batches | lr 3.32 | ms/batch 115.42 | loss 4.85 | ppl 127.51\n", + "| epoch 9 | 7800/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.85 | ppl 127.59\n", + "| epoch 9 | 8000/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.84 | ppl 126.24\n", + "| epoch 9 | 8200/13484 batches | lr 3.32 | ms/batch 115.46 | loss 4.82 | ppl 124.14\n", + "| epoch 9 | 8400/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.85 | ppl 127.32\n", + "| epoch 9 | 8600/13484 batches | lr 3.32 | ms/batch 115.37 | loss 4.86 | ppl 128.81\n", + "| epoch 9 | 8800/13484 batches | lr 3.32 | ms/batch 115.34 | loss 4.83 | ppl 125.56\n", + "| epoch 9 | 9000/13484 batches | lr 3.32 | ms/batch 115.35 | loss 4.85 | ppl 128.24\n", + "| epoch 9 | 9200/13484 batches | lr 3.32 | ms/batch 115.38 | loss 4.87 | ppl 130.12\n", + "| epoch 9 | 9400/13484 batches | lr 3.32 | ms/batch 115.41 | loss 4.86 | ppl 129.31\n", + "| epoch 9 | 9600/13484 batches | lr 3.32 | ms/batch 115.38 | loss 4.84 | ppl 126.04\n", + "| epoch 9 | 9800/13484 batches | lr 3.32 | ms/batch 115.47 | loss 4.81 | ppl 122.88\n", + "| epoch 9 | 10000/13484 batches | lr 3.32 | ms/batch 115.43 | loss 4.84 | ppl 126.54\n", + "| epoch 9 | 10200/13484 batches | lr 3.32 | ms/batch 115.43 | loss 4.80 | ppl 121.48\n", + "| epoch 9 | 10400/13484 batches | lr 3.32 | ms/batch 115.35 | loss 4.78 | ppl 118.65\n", + "| epoch 9 | 10600/13484 batches | lr 3.32 | ms/batch 115.38 | loss 4.83 | ppl 124.64\n", + "| epoch 9 | 10800/13484 batches | lr 3.32 | ms/batch 115.47 | loss 4.85 | ppl 127.13\n", + "| epoch 9 | 11000/13484 batches | lr 3.32 | ms/batch 115.47 | loss 4.85 | ppl 127.77\n", + "| epoch 9 | 11200/13484 batches | lr 3.32 | ms/batch 115.41 | loss 4.84 | ppl 126.57\n", + "| epoch 9 | 11400/13484 batches | lr 3.32 | ms/batch 115.41 | loss 4.79 | ppl 120.87\n", + "| epoch 9 | 11600/13484 batches | lr 3.32 | ms/batch 115.33 | loss 4.83 | ppl 125.52\n", + "| epoch 9 | 11800/13484 batches | lr 3.32 | ms/batch 115.34 | loss 4.80 | ppl 120.94\n", + "| epoch 9 | 12000/13484 batches | lr 3.32 | ms/batch 115.43 | loss 4.85 | ppl 127.12\n", + "| epoch 9 | 12200/13484 batches | lr 3.32 | ms/batch 115.41 | loss 4.78 | ppl 119.42\n", + "| epoch 9 | 12400/13484 batches | lr 3.32 | ms/batch 115.45 | loss 4.79 | ppl 120.40\n", + "| epoch 9 | 12600/13484 batches | lr 3.32 | ms/batch 115.44 | loss 4.82 | ppl 124.30\n", + "| epoch 9 | 12800/13484 batches | lr 3.32 | ms/batch 115.40 | loss 4.84 | ppl 126.27\n", + "| epoch 9 | 13000/13484 batches | lr 3.32 | ms/batch 115.38 | loss 4.83 | ppl 125.13\n", + "| epoch 9 | 13200/13484 batches | lr 3.32 | ms/batch 115.39 | loss 4.83 | ppl 125.83\n", + "| epoch 9 | 13400/13484 batches | lr 3.32 | ms/batch 115.42 | loss 4.85 | ppl 128.06\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 9 | time: 1628.05s | valid loss 5.02 | valid ppl 150.80\n", + "-----------------------------------------------------------------------------------------\n", + "| epoch 10 | 200/13484 batches | lr 3.15 | ms/batch 116.03 | loss 4.88 | ppl 131.35\n", + "| epoch 10 | 400/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.84 | ppl 126.70\n", + "| epoch 10 | 600/13484 batches | lr 3.15 | ms/batch 115.47 | loss 4.82 | ppl 124.18\n", + "| epoch 10 | 800/13484 batches | lr 3.15 | ms/batch 115.45 | loss 4.82 | ppl 123.47\n", + "| epoch 10 | 1000/13484 batches | lr 3.15 | ms/batch 115.31 | loss 4.82 | ppl 124.52\n", + "| epoch 10 | 1200/13484 batches | lr 3.15 | ms/batch 115.36 | loss 4.83 | ppl 124.69\n", + "| epoch 10 | 1400/13484 batches | lr 3.15 | ms/batch 115.49 | loss 4.81 | ppl 122.50\n", + "| epoch 10 | 1600/13484 batches | lr 3.15 | ms/batch 115.44 | loss 4.85 | ppl 127.35\n", + "| epoch 10 | 1800/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.83 | ppl 124.97\n", + "| epoch 10 | 2000/13484 batches | lr 3.15 | ms/batch 115.45 | loss 4.80 | ppl 121.45\n", + "| epoch 10 | 2200/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.80 | ppl 121.97\n", + "| epoch 10 | 2400/13484 batches | lr 3.15 | ms/batch 115.37 | loss 4.80 | ppl 122.05\n", + "| epoch 10 | 2600/13484 batches | lr 3.15 | ms/batch 115.46 | loss 4.79 | ppl 120.16\n", + "| epoch 10 | 2800/13484 batches | lr 3.15 | ms/batch 115.42 | loss 4.83 | ppl 125.44\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "| epoch 37 | 800/ 3181 batches | lr 0.79 | ms/batch 61.32 | loss 3.57 | ppl 35.62\n", - "| epoch 37 | 1000/ 3181 batches | lr 0.79 | ms/batch 61.30 | loss 3.58 | ppl 35.97\n", - "| epoch 37 | 1200/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.54 | ppl 34.53\n", - "| epoch 37 | 1400/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.58 | ppl 35.77\n", - "| epoch 37 | 1600/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.55 | ppl 34.77\n", - "| epoch 37 | 1800/ 3181 batches | lr 0.79 | ms/batch 61.40 | loss 3.55 | ppl 34.93\n", - "| epoch 37 | 2000/ 3181 batches | lr 0.79 | ms/batch 61.41 | loss 3.53 | ppl 34.29\n", - "| epoch 37 | 2200/ 3181 batches | lr 0.79 | ms/batch 61.45 | loss 3.53 | ppl 34.02\n", - "| epoch 37 | 2400/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.53 | ppl 34.29\n", - "| epoch 37 | 2600/ 3181 batches | lr 0.79 | ms/batch 61.37 | loss 3.48 | ppl 32.30\n", - "| epoch 37 | 2800/ 3181 batches | lr 0.79 | ms/batch 61.42 | loss 3.56 | ppl 35.19\n", - "| epoch 37 | 3000/ 3181 batches | lr 0.79 | ms/batch 61.38 | loss 3.47 | ppl 32.07\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 37 | time: 206.68s | valid loss 5.87 | valid ppl 352.86\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 38 | 200/ 3181 batches | lr 0.75 | ms/batch 61.67 | loss 3.59 | ppl 36.37\n", - "| epoch 38 | 400/ 3181 batches | lr 0.75 | ms/batch 61.50 | loss 3.53 | ppl 34.01\n", - "| epoch 38 | 600/ 3181 batches | lr 0.75 | ms/batch 61.39 | loss 3.50 | ppl 33.04\n", - "| epoch 38 | 800/ 3181 batches | lr 0.75 | ms/batch 61.38 | loss 3.56 | ppl 35.12\n", - "| epoch 38 | 1000/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.57 | ppl 35.51\n", - "| epoch 38 | 1200/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.52 | ppl 33.89\n", - "| epoch 38 | 1400/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.56 | ppl 35.27\n", - "| epoch 38 | 1600/ 3181 batches | lr 0.75 | ms/batch 61.46 | loss 3.54 | ppl 34.43\n", - "| epoch 38 | 1800/ 3181 batches | lr 0.75 | ms/batch 61.40 | loss 3.54 | ppl 34.47\n", - "| epoch 38 | 2000/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.52 | ppl 33.89\n", - "| epoch 38 | 2200/ 3181 batches | lr 0.75 | ms/batch 61.44 | loss 3.52 | ppl 33.72\n", - "| epoch 38 | 2400/ 3181 batches | lr 0.75 | ms/batch 61.41 | loss 3.52 | ppl 33.79\n", - "| epoch 38 | 2600/ 3181 batches | lr 0.75 | ms/batch 61.42 | loss 3.46 | ppl 31.89\n", - "| epoch 38 | 2800/ 3181 batches | lr 0.75 | ms/batch 61.40 | loss 3.54 | ppl 34.41\n", - "| epoch 38 | 3000/ 3181 batches | lr 0.75 | ms/batch 61.50 | loss 3.45 | ppl 31.48\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 38 | time: 206.80s | valid loss 5.88 | valid ppl 358.16\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 39 | 200/ 3181 batches | lr 0.71 | ms/batch 61.71 | loss 3.58 | ppl 36.05\n", - "| epoch 39 | 400/ 3181 batches | lr 0.71 | ms/batch 61.40 | loss 3.52 | ppl 33.71\n", - "| epoch 39 | 600/ 3181 batches | lr 0.71 | ms/batch 61.44 | loss 3.49 | ppl 32.73\n", - "| epoch 39 | 800/ 3181 batches | lr 0.71 | ms/batch 61.34 | loss 3.54 | ppl 34.55\n", - "| epoch 39 | 1000/ 3181 batches | lr 0.71 | ms/batch 61.37 | loss 3.56 | ppl 35.03\n", - "| epoch 39 | 1200/ 3181 batches | lr 0.71 | ms/batch 61.41 | loss 3.51 | ppl 33.38\n", - "| epoch 39 | 1400/ 3181 batches | lr 0.71 | ms/batch 61.43 | loss 3.55 | ppl 34.77\n", - "| epoch 39 | 1600/ 3181 batches | lr 0.71 | ms/batch 61.43 | loss 3.52 | ppl 33.79\n", - "| epoch 39 | 1800/ 3181 batches | lr 0.71 | ms/batch 61.46 | loss 3.52 | ppl 33.85\n", - "| epoch 39 | 2000/ 3181 batches | lr 0.71 | ms/batch 61.41 | loss 3.51 | ppl 33.32\n", - "| epoch 39 | 2200/ 3181 batches | lr 0.71 | ms/batch 61.38 | loss 3.50 | ppl 33.21\n", - "| epoch 39 | 2400/ 3181 batches | lr 0.71 | ms/batch 61.38 | loss 3.51 | ppl 33.39\n", - "| epoch 39 | 2600/ 3181 batches | lr 0.71 | ms/batch 61.42 | loss 3.45 | ppl 31.48\n", - "| epoch 39 | 2800/ 3181 batches | lr 0.71 | ms/batch 61.37 | loss 3.53 | ppl 34.01\n", - "| epoch 39 | 3000/ 3181 batches | lr 0.71 | ms/batch 61.42 | loss 3.43 | ppl 30.93\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 39 | time: 206.75s | valid loss 5.92 | valid ppl 370.78\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 40 | 200/ 3181 batches | lr 0.68 | ms/batch 61.74 | loss 3.57 | ppl 35.52\n", - "| epoch 40 | 400/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.50 | ppl 33.05\n", - "| epoch 40 | 600/ 3181 batches | lr 0.68 | ms/batch 61.43 | loss 3.47 | ppl 32.19\n", - "| epoch 40 | 800/ 3181 batches | lr 0.68 | ms/batch 61.44 | loss 3.53 | ppl 34.18\n", - "| epoch 40 | 1000/ 3181 batches | lr 0.68 | ms/batch 61.37 | loss 3.54 | ppl 34.37\n", - "| epoch 40 | 1200/ 3181 batches | lr 0.68 | ms/batch 61.45 | loss 3.50 | ppl 33.04\n", - "| epoch 40 | 1400/ 3181 batches | lr 0.68 | ms/batch 61.45 | loss 3.53 | ppl 34.21\n", - "| epoch 40 | 1600/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.51 | ppl 33.31\n", - "| epoch 40 | 1800/ 3181 batches | lr 0.68 | ms/batch 61.37 | loss 3.51 | ppl 33.39\n", - "| epoch 40 | 2000/ 3181 batches | lr 0.68 | ms/batch 61.44 | loss 3.50 | ppl 32.98\n", - "| epoch 40 | 2200/ 3181 batches | lr 0.68 | ms/batch 61.40 | loss 3.49 | ppl 32.67\n", - "| epoch 40 | 2400/ 3181 batches | lr 0.68 | ms/batch 61.43 | loss 3.49 | ppl 32.91\n", - "| epoch 40 | 2600/ 3181 batches | lr 0.68 | ms/batch 61.42 | loss 3.44 | ppl 31.10\n", - "| epoch 40 | 2800/ 3181 batches | lr 0.68 | ms/batch 61.34 | loss 3.51 | ppl 33.46\n", - "| epoch 40 | 3000/ 3181 batches | lr 0.68 | ms/batch 61.28 | loss 3.42 | ppl 30.62\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 40 | time: 206.76s | valid loss 5.93 | valid ppl 376.29\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 41 | 200/ 3181 batches | lr 0.64 | ms/batch 61.65 | loss 3.56 | ppl 35.06\n", - "| epoch 41 | 400/ 3181 batches | lr 0.64 | ms/batch 61.47 | loss 3.49 | ppl 32.70\n", - "| epoch 41 | 600/ 3181 batches | lr 0.64 | ms/batch 61.39 | loss 3.45 | ppl 31.62\n", - "| epoch 41 | 800/ 3181 batches | lr 0.64 | ms/batch 61.38 | loss 3.52 | ppl 33.65\n", - "| epoch 41 | 1000/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.53 | ppl 33.98\n", - "| epoch 41 | 1200/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.49 | ppl 32.75\n", - "| epoch 41 | 1400/ 3181 batches | lr 0.64 | ms/batch 61.41 | loss 3.52 | ppl 33.71\n", - "| epoch 41 | 1600/ 3181 batches | lr 0.64 | ms/batch 61.45 | loss 3.50 | ppl 33.06\n", - "| epoch 41 | 1800/ 3181 batches | lr 0.64 | ms/batch 61.35 | loss 3.50 | ppl 33.27\n", - "| epoch 41 | 2000/ 3181 batches | lr 0.64 | ms/batch 61.43 | loss 3.49 | ppl 32.72\n", - "| epoch 41 | 2200/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.47 | ppl 32.12\n", - "| epoch 41 | 2400/ 3181 batches | lr 0.64 | ms/batch 61.42 | loss 3.48 | ppl 32.54\n", - "| epoch 41 | 2600/ 3181 batches | lr 0.64 | ms/batch 61.44 | loss 3.43 | ppl 30.92\n", - "| epoch 41 | 2800/ 3181 batches | lr 0.64 | ms/batch 61.37 | loss 3.50 | ppl 33.15\n", - "| epoch 41 | 3000/ 3181 batches | lr 0.64 | ms/batch 61.38 | loss 3.41 | ppl 30.36\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 41 | time: 206.74s | valid loss 5.91 | valid ppl 369.07\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 42 | 200/ 3181 batches | lr 0.61 | ms/batch 61.72 | loss 3.55 | ppl 34.66\n", - "| epoch 42 | 400/ 3181 batches | lr 0.61 | ms/batch 61.40 | loss 3.48 | ppl 32.31\n", - "| epoch 42 | 600/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.45 | ppl 31.42\n", - "| epoch 42 | 800/ 3181 batches | lr 0.61 | ms/batch 61.36 | loss 3.51 | ppl 33.32\n", - "| epoch 42 | 1000/ 3181 batches | lr 0.61 | ms/batch 61.42 | loss 3.52 | ppl 33.79\n", - "| epoch 42 | 1200/ 3181 batches | lr 0.61 | ms/batch 61.43 | loss 3.47 | ppl 32.12\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "| epoch 42 | 1400/ 3181 batches | lr 0.61 | ms/batch 61.45 | loss 3.50 | ppl 33.28\n", - "| epoch 42 | 1600/ 3181 batches | lr 0.61 | ms/batch 61.41 | loss 3.49 | ppl 32.66\n", - "| epoch 42 | 1800/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.49 | ppl 32.80\n", - "| epoch 42 | 2000/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.47 | ppl 32.27\n", - "| epoch 42 | 2200/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.46 | ppl 31.79\n", - "| epoch 42 | 2400/ 3181 batches | lr 0.61 | ms/batch 61.44 | loss 3.48 | ppl 32.32\n", - "| epoch 42 | 2600/ 3181 batches | lr 0.61 | ms/batch 61.39 | loss 3.42 | ppl 30.42\n", - "| epoch 42 | 2800/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.50 | ppl 32.97\n", - "| epoch 42 | 3000/ 3181 batches | lr 0.61 | ms/batch 61.37 | loss 3.40 | ppl 29.94\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 42 | time: 206.74s | valid loss 5.92 | valid ppl 371.93\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 43 | 200/ 3181 batches | lr 0.58 | ms/batch 61.68 | loss 3.53 | ppl 34.15\n", - "| epoch 43 | 400/ 3181 batches | lr 0.58 | ms/batch 61.40 | loss 3.47 | ppl 32.05\n", - "| epoch 43 | 600/ 3181 batches | lr 0.58 | ms/batch 61.38 | loss 3.44 | ppl 31.09\n", - "| epoch 43 | 800/ 3181 batches | lr 0.58 | ms/batch 61.37 | loss 3.50 | ppl 33.06\n", - "| epoch 43 | 1000/ 3181 batches | lr 0.58 | ms/batch 61.42 | loss 3.51 | ppl 33.36\n", - "| epoch 43 | 1200/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.47 | ppl 31.98\n", - "| epoch 43 | 1400/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.50 | ppl 32.97\n", - "| epoch 43 | 1600/ 3181 batches | lr 0.58 | ms/batch 61.41 | loss 3.47 | ppl 32.29\n", - "| epoch 43 | 1800/ 3181 batches | lr 0.58 | ms/batch 61.46 | loss 3.47 | ppl 32.22\n", - "| epoch 43 | 2000/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.46 | ppl 31.72\n", - "| epoch 43 | 2200/ 3181 batches | lr 0.58 | ms/batch 61.33 | loss 3.45 | ppl 31.37\n", - "| epoch 43 | 2400/ 3181 batches | lr 0.58 | ms/batch 61.36 | loss 3.46 | ppl 31.81\n", - "| epoch 43 | 2600/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.41 | ppl 30.17\n", - "| epoch 43 | 2800/ 3181 batches | lr 0.58 | ms/batch 61.44 | loss 3.49 | ppl 32.63\n", - "| epoch 43 | 3000/ 3181 batches | lr 0.58 | ms/batch 61.39 | loss 3.39 | ppl 29.80\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 43 | time: 206.72s | valid loss 5.98 | valid ppl 394.70\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 44 | 200/ 3181 batches | lr 0.55 | ms/batch 61.69 | loss 3.52 | ppl 33.79\n", - "| epoch 44 | 400/ 3181 batches | lr 0.55 | ms/batch 61.47 | loss 3.46 | ppl 31.81\n", - "| epoch 44 | 600/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.42 | ppl 30.69\n", - "| epoch 44 | 800/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.48 | ppl 32.49\n", - "| epoch 44 | 1000/ 3181 batches | lr 0.55 | ms/batch 61.43 | loss 3.49 | ppl 32.84\n", - "| epoch 44 | 1200/ 3181 batches | lr 0.55 | ms/batch 61.43 | loss 3.45 | ppl 31.56\n", - "| epoch 44 | 1400/ 3181 batches | lr 0.55 | ms/batch 61.41 | loss 3.48 | ppl 32.59\n", - "| epoch 44 | 1600/ 3181 batches | lr 0.55 | ms/batch 61.42 | loss 3.46 | ppl 31.93\n", - "| epoch 44 | 1800/ 3181 batches | lr 0.55 | ms/batch 61.41 | loss 3.46 | ppl 31.94\n", - "| epoch 44 | 2000/ 3181 batches | lr 0.55 | ms/batch 61.36 | loss 3.45 | ppl 31.62\n", - "| epoch 44 | 2200/ 3181 batches | lr 0.55 | ms/batch 61.37 | loss 3.44 | ppl 31.16\n", - "| epoch 44 | 2400/ 3181 batches | lr 0.55 | ms/batch 61.35 | loss 3.45 | ppl 31.47\n", - "| epoch 44 | 2600/ 3181 batches | lr 0.55 | ms/batch 61.40 | loss 3.39 | ppl 29.77\n", - "| epoch 44 | 2800/ 3181 batches | lr 0.55 | ms/batch 61.44 | loss 3.47 | ppl 32.19\n", - "| epoch 44 | 3000/ 3181 batches | lr 0.55 | ms/batch 61.44 | loss 3.38 | ppl 29.40\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 44 | time: 206.75s | valid loss 5.96 | valid ppl 389.15\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 45 | 200/ 3181 batches | lr 0.52 | ms/batch 61.74 | loss 3.51 | ppl 33.51\n", - "| epoch 45 | 400/ 3181 batches | lr 0.52 | ms/batch 61.41 | loss 3.45 | ppl 31.39\n", - "| epoch 45 | 600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.42 | ppl 30.55\n", - "| epoch 45 | 800/ 3181 batches | lr 0.52 | ms/batch 61.35 | loss 3.48 | ppl 32.50\n", - "| epoch 45 | 1000/ 3181 batches | lr 0.52 | ms/batch 61.38 | loss 3.48 | ppl 32.51\n", - "| epoch 45 | 1200/ 3181 batches | lr 0.52 | ms/batch 61.33 | loss 3.45 | ppl 31.39\n", - "| epoch 45 | 1400/ 3181 batches | lr 0.52 | ms/batch 61.43 | loss 3.47 | ppl 32.20\n", - "| epoch 45 | 1600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.45 | ppl 31.63\n", - "| epoch 45 | 1800/ 3181 batches | lr 0.52 | ms/batch 61.45 | loss 3.45 | ppl 31.61\n", - "| epoch 45 | 2000/ 3181 batches | lr 0.52 | ms/batch 61.36 | loss 3.44 | ppl 31.17\n", - "| epoch 45 | 2200/ 3181 batches | lr 0.52 | ms/batch 61.48 | loss 3.43 | ppl 31.02\n", - "| epoch 45 | 2400/ 3181 batches | lr 0.52 | ms/batch 61.45 | loss 3.44 | ppl 31.18\n", - "| epoch 45 | 2600/ 3181 batches | lr 0.52 | ms/batch 61.40 | loss 3.39 | ppl 29.52\n", - "| epoch 45 | 2800/ 3181 batches | lr 0.52 | ms/batch 61.43 | loss 3.46 | ppl 31.72\n", - "| epoch 45 | 3000/ 3181 batches | lr 0.52 | ms/batch 61.48 | loss 3.37 | ppl 29.15\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 45 | time: 206.77s | valid loss 5.99 | valid ppl 398.09\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 46 | 200/ 3181 batches | lr 0.50 | ms/batch 61.71 | loss 3.50 | ppl 33.04\n", - "| epoch 46 | 400/ 3181 batches | lr 0.50 | ms/batch 61.45 | loss 3.44 | ppl 31.04\n", - "| epoch 46 | 600/ 3181 batches | lr 0.50 | ms/batch 61.39 | loss 3.41 | ppl 30.26\n", - "| epoch 46 | 800/ 3181 batches | lr 0.50 | ms/batch 61.47 | loss 3.47 | ppl 32.01\n", - "| epoch 46 | 1000/ 3181 batches | lr 0.50 | ms/batch 61.39 | loss 3.47 | ppl 32.08\n", - "| epoch 46 | 1200/ 3181 batches | lr 0.50 | ms/batch 61.41 | loss 3.43 | ppl 30.86\n", - "| epoch 46 | 1400/ 3181 batches | lr 0.50 | ms/batch 61.34 | loss 3.47 | ppl 32.15\n", - "| epoch 46 | 1600/ 3181 batches | lr 0.50 | ms/batch 61.44 | loss 3.44 | ppl 31.32\n", - "| epoch 46 | 1800/ 3181 batches | lr 0.50 | ms/batch 61.42 | loss 3.45 | ppl 31.49\n", - "| epoch 46 | 2000/ 3181 batches | lr 0.50 | ms/batch 61.42 | loss 3.44 | ppl 31.04\n", - "| epoch 46 | 2200/ 3181 batches | lr 0.50 | ms/batch 61.48 | loss 3.42 | ppl 30.63\n", - "| epoch 46 | 2400/ 3181 batches | lr 0.50 | ms/batch 61.35 | loss 3.43 | ppl 30.95\n", - "| epoch 46 | 2600/ 3181 batches | lr 0.50 | ms/batch 61.37 | loss 3.38 | ppl 29.38\n", - "| epoch 46 | 2800/ 3181 batches | lr 0.50 | ms/batch 61.41 | loss 3.45 | ppl 31.45\n", - "| epoch 46 | 3000/ 3181 batches | lr 0.50 | ms/batch 61.45 | loss 3.37 | ppl 28.96\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 46 | time: 206.77s | valid loss 5.96 | valid ppl 389.00\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 47 | 200/ 3181 batches | lr 0.47 | ms/batch 61.74 | loss 3.49 | ppl 32.78\n", - "| epoch 47 | 400/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 30.76\n", - "| epoch 47 | 600/ 3181 batches | lr 0.47 | ms/batch 61.47 | loss 3.40 | ppl 29.86\n", - "| epoch 47 | 800/ 3181 batches | lr 0.47 | ms/batch 61.39 | loss 3.46 | ppl 31.86\n", - "| epoch 47 | 1000/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.46 | ppl 31.90\n", - "| epoch 47 | 1200/ 3181 batches | lr 0.47 | ms/batch 61.38 | loss 3.42 | ppl 30.71\n", - "| epoch 47 | 1400/ 3181 batches | lr 0.47 | ms/batch 61.39 | loss 3.46 | ppl 31.91\n", - "| epoch 47 | 1600/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 31.00\n", - "| epoch 47 | 1800/ 3181 batches | lr 0.47 | ms/batch 61.54 | loss 3.44 | ppl 31.18\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "| epoch 47 | 2000/ 3181 batches | lr 0.47 | ms/batch 61.48 | loss 3.43 | ppl 30.73\n", - "| epoch 47 | 2200/ 3181 batches | lr 0.47 | ms/batch 61.49 | loss 3.41 | ppl 30.37\n", - "| epoch 47 | 2400/ 3181 batches | lr 0.47 | ms/batch 61.52 | loss 3.42 | ppl 30.63\n", - "| epoch 47 | 2600/ 3181 batches | lr 0.47 | ms/batch 61.49 | loss 3.37 | ppl 28.98\n", - "| epoch 47 | 2800/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.45 | ppl 31.34\n", - "| epoch 47 | 3000/ 3181 batches | lr 0.47 | ms/batch 61.43 | loss 3.35 | ppl 28.50\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 47 | time: 206.90s | valid loss 5.96 | valid ppl 388.68\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 48 | 200/ 3181 batches | lr 0.45 | ms/batch 61.70 | loss 3.48 | ppl 32.61\n", - "| epoch 48 | 400/ 3181 batches | lr 0.45 | ms/batch 61.43 | loss 3.42 | ppl 30.51\n", - "| epoch 48 | 600/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.39 | ppl 29.65\n", - "| epoch 48 | 800/ 3181 batches | lr 0.45 | ms/batch 61.36 | loss 3.46 | ppl 31.70\n", - "| epoch 48 | 1000/ 3181 batches | lr 0.45 | ms/batch 61.50 | loss 3.46 | ppl 31.66\n", - "| epoch 48 | 1200/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.42 | ppl 30.56\n", - "| epoch 48 | 1400/ 3181 batches | lr 0.45 | ms/batch 61.49 | loss 3.45 | ppl 31.65\n", - "| epoch 48 | 1600/ 3181 batches | lr 0.45 | ms/batch 61.43 | loss 3.42 | ppl 30.66\n", - "| epoch 48 | 1800/ 3181 batches | lr 0.45 | ms/batch 61.41 | loss 3.43 | ppl 30.74\n", - "| epoch 48 | 2000/ 3181 batches | lr 0.45 | ms/batch 61.40 | loss 3.42 | ppl 30.48\n", - "| epoch 48 | 2200/ 3181 batches | lr 0.45 | ms/batch 61.47 | loss 3.41 | ppl 30.33\n", - "| epoch 48 | 2400/ 3181 batches | lr 0.45 | ms/batch 61.38 | loss 3.41 | ppl 30.38\n", - "| epoch 48 | 2600/ 3181 batches | lr 0.45 | ms/batch 61.46 | loss 3.36 | ppl 28.89\n", - "| epoch 48 | 2800/ 3181 batches | lr 0.45 | ms/batch 61.42 | loss 3.43 | ppl 30.96\n", - "| epoch 48 | 3000/ 3181 batches | lr 0.45 | ms/batch 61.39 | loss 3.34 | ppl 28.33\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 48 | time: 206.81s | valid loss 5.95 | valid ppl 383.84\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 49 | 200/ 3181 batches | lr 0.43 | ms/batch 61.73 | loss 3.47 | ppl 32.23\n", - "| epoch 49 | 400/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.41 | ppl 30.33\n", - "| epoch 49 | 600/ 3181 batches | lr 0.43 | ms/batch 61.43 | loss 3.38 | ppl 29.52\n", - "| epoch 49 | 800/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.45 | ppl 31.40\n", - "| epoch 49 | 1000/ 3181 batches | lr 0.43 | ms/batch 61.47 | loss 3.46 | ppl 31.72\n", - "| epoch 49 | 1200/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.41 | ppl 30.25\n", - "| epoch 49 | 1400/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.45 | ppl 31.41\n", - "| epoch 49 | 1600/ 3181 batches | lr 0.43 | ms/batch 61.40 | loss 3.42 | ppl 30.56\n", - "| epoch 49 | 1800/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.43 | ppl 30.75\n", - "| epoch 49 | 2000/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.41 | ppl 30.28\n", - "| epoch 49 | 2200/ 3181 batches | lr 0.43 | ms/batch 61.45 | loss 3.41 | ppl 30.14\n", - "| epoch 49 | 2400/ 3181 batches | lr 0.43 | ms/batch 61.47 | loss 3.41 | ppl 30.22\n", - "| epoch 49 | 2600/ 3181 batches | lr 0.43 | ms/batch 61.44 | loss 3.35 | ppl 28.48\n", - "| epoch 49 | 2800/ 3181 batches | lr 0.43 | ms/batch 61.39 | loss 3.43 | ppl 30.72\n", - "| epoch 49 | 3000/ 3181 batches | lr 0.43 | ms/batch 61.37 | loss 3.34 | ppl 28.18\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 49 | time: 206.79s | valid loss 5.98 | valid ppl 395.98\n", - "-----------------------------------------------------------------------------------------\n", - "| epoch 50 | 200/ 3181 batches | lr 0.40 | ms/batch 61.71 | loss 3.47 | ppl 32.06\n", - "| epoch 50 | 400/ 3181 batches | lr 0.40 | ms/batch 61.39 | loss 3.41 | ppl 30.15\n", - "| epoch 50 | 600/ 3181 batches | lr 0.40 | ms/batch 61.37 | loss 3.38 | ppl 29.27\n", - "| epoch 50 | 800/ 3181 batches | lr 0.40 | ms/batch 61.42 | loss 3.43 | ppl 31.02\n", - "| epoch 50 | 1000/ 3181 batches | lr 0.40 | ms/batch 61.34 | loss 3.44 | ppl 31.16\n", - "| epoch 50 | 1200/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.40 | ppl 29.97\n", - "| epoch 50 | 1400/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.44 | ppl 31.23\n", - "| epoch 50 | 1600/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.41 | ppl 30.24\n", - "| epoch 50 | 1800/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.42 | ppl 30.64\n", - "| epoch 50 | 2000/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.40 | ppl 30.07\n", - "| epoch 50 | 2200/ 3181 batches | lr 0.40 | ms/batch 61.49 | loss 3.39 | ppl 29.78\n", - "| epoch 50 | 2400/ 3181 batches | lr 0.40 | ms/batch 61.41 | loss 3.40 | ppl 29.98\n", - "| epoch 50 | 2600/ 3181 batches | lr 0.40 | ms/batch 61.43 | loss 3.35 | ppl 28.40\n", - "| epoch 50 | 2800/ 3181 batches | lr 0.40 | ms/batch 61.38 | loss 3.43 | ppl 30.72\n", - "| epoch 50 | 3000/ 3181 batches | lr 0.40 | ms/batch 61.44 | loss 3.34 | ppl 28.08\n", - "-----------------------------------------------------------------------------------------\n", - "| end of epoch 50 | time: 206.77s | valid loss 6.01 | valid ppl 407.69\n", + "| epoch 10 | 3000/13484 batches | lr 3.15 | ms/batch 115.36 | loss 4.81 | ppl 122.12\n", + "| epoch 10 | 3200/13484 batches | lr 3.15 | ms/batch 115.44 | loss 4.79 | ppl 120.18\n", + "| epoch 10 | 3400/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.84 | ppl 127.05\n", + "| epoch 10 | 3600/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.82 | ppl 123.70\n", + "| epoch 10 | 3800/13484 batches | lr 3.15 | ms/batch 115.40 | loss 4.80 | ppl 121.74\n", + "| epoch 10 | 4000/13484 batches | lr 3.15 | ms/batch 115.49 | loss 4.76 | ppl 116.53\n", + "| epoch 10 | 4200/13484 batches | lr 3.15 | ms/batch 115.42 | loss 4.78 | ppl 119.64\n", + "| epoch 10 | 4400/13484 batches | lr 3.15 | ms/batch 115.36 | loss 4.80 | ppl 121.17\n", + "| epoch 10 | 4600/13484 batches | lr 3.15 | ms/batch 115.42 | loss 4.84 | ppl 126.61\n", + "| epoch 10 | 4800/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.83 | ppl 124.71\n", + "| epoch 10 | 5000/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.81 | ppl 122.89\n", + "| epoch 10 | 5200/13484 batches | lr 3.15 | ms/batch 115.36 | loss 4.81 | ppl 123.00\n", + "| epoch 10 | 5400/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.79 | ppl 120.50\n", + "| epoch 10 | 5600/13484 batches | lr 3.15 | ms/batch 115.38 | loss 4.80 | ppl 121.56\n", + "| epoch 10 | 5800/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.80 | ppl 121.20\n", + "| epoch 10 | 6000/13484 batches | lr 3.15 | ms/batch 115.38 | loss 4.82 | ppl 123.72\n", + "| epoch 10 | 6200/13484 batches | lr 3.15 | ms/batch 115.35 | loss 4.85 | ppl 127.61\n", + "| epoch 10 | 6400/13484 batches | lr 3.15 | ms/batch 115.32 | loss 4.82 | ppl 124.04\n", + "| epoch 10 | 6600/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.85 | ppl 127.34\n", + "| epoch 10 | 6800/13484 batches | lr 3.15 | ms/batch 115.38 | loss 4.80 | ppl 121.21\n", + "| epoch 10 | 7000/13484 batches | lr 3.15 | ms/batch 115.40 | loss 4.84 | ppl 126.43\n", + "| epoch 10 | 7200/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.81 | ppl 122.41\n", + "| epoch 10 | 7400/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.81 | ppl 122.46\n", + "| epoch 10 | 7600/13484 batches | lr 3.15 | ms/batch 115.37 | loss 4.80 | ppl 122.05\n", + "| epoch 10 | 7800/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.80 | ppl 121.58\n", + "| epoch 10 | 8000/13484 batches | lr 3.15 | ms/batch 115.32 | loss 4.79 | ppl 120.04\n", + "| epoch 10 | 8200/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.78 | ppl 118.97\n", + "| epoch 10 | 8400/13484 batches | lr 3.15 | ms/batch 115.44 | loss 4.80 | ppl 121.55\n", + "| epoch 10 | 8600/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.82 | ppl 123.48\n", + "| epoch 10 | 8800/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.79 | ppl 119.98\n", + "| epoch 10 | 9000/13484 batches | lr 3.15 | ms/batch 115.37 | loss 4.80 | ppl 121.60\n", + "| epoch 10 | 9200/13484 batches | lr 3.15 | ms/batch 115.50 | loss 4.82 | ppl 124.26\n", + "| epoch 10 | 9400/13484 batches | lr 3.15 | ms/batch 115.44 | loss 4.83 | ppl 124.68\n", + "| epoch 10 | 9600/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.79 | ppl 120.72\n", + "| epoch 10 | 9800/13484 batches | lr 3.15 | ms/batch 115.40 | loss 4.77 | ppl 118.27\n", + "| epoch 10 | 10000/13484 batches | lr 3.15 | ms/batch 115.43 | loss 4.80 | ppl 121.83\n", + "| epoch 10 | 10200/13484 batches | lr 3.15 | ms/batch 115.45 | loss 4.76 | ppl 116.50\n", + "| epoch 10 | 10400/13484 batches | lr 3.15 | ms/batch 115.34 | loss 4.74 | ppl 114.00\n", + "| epoch 10 | 10600/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.79 | ppl 119.78\n", + "| epoch 10 | 10800/13484 batches | lr 3.15 | ms/batch 115.45 | loss 4.80 | ppl 121.63\n", + "| epoch 10 | 11000/13484 batches | lr 3.15 | ms/batch 115.32 | loss 4.80 | ppl 121.41\n", + "| epoch 10 | 11200/13484 batches | lr 3.15 | ms/batch 115.52 | loss 4.80 | ppl 121.11\n", + "| epoch 10 | 11400/13484 batches | lr 3.15 | ms/batch 115.40 | loss 4.75 | ppl 115.26\n", + "| epoch 10 | 11600/13484 batches | lr 3.15 | ms/batch 115.45 | loss 4.79 | ppl 120.63\n", + "| epoch 10 | 11800/13484 batches | lr 3.15 | ms/batch 115.36 | loss 4.75 | ppl 115.77\n", + "| epoch 10 | 12000/13484 batches | lr 3.15 | ms/batch 115.48 | loss 4.80 | ppl 121.70\n", + "| epoch 10 | 12200/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.74 | ppl 114.59\n", + "| epoch 10 | 12400/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.76 | ppl 116.79\n", + "| epoch 10 | 12600/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.77 | ppl 118.25\n", + "| epoch 10 | 12800/13484 batches | lr 3.15 | ms/batch 115.40 | loss 4.79 | ppl 120.47\n", + "| epoch 10 | 13000/13484 batches | lr 3.15 | ms/batch 115.41 | loss 4.78 | ppl 119.53\n", + "| epoch 10 | 13200/13484 batches | lr 3.15 | ms/batch 115.42 | loss 4.79 | ppl 120.28\n", + "| epoch 10 | 13400/13484 batches | lr 3.15 | ms/batch 115.39 | loss 4.81 | ppl 122.17\n", + "-----------------------------------------------------------------------------------------\n", + "| end of epoch 10 | time: 1628.47s | valid loss 4.98 | valid ppl 145.33\n", "-----------------------------------------------------------------------------------------\n" ] } ], "source": [ "best_val_loss = float('inf')\n", - "epochs = 50\n", + "epochs = 10\n", "best_model = None\n", "\n", "for epoch in range(1, epochs + 1):\n", @@ -1704,7 +1495,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 32, "id": "12fdd0aa", "metadata": { "scrolled": true @@ -1715,7 +1506,7 @@ "output_type": "stream", "text": [ "=========================================================================================\n", - "| End of training | test loss 5.36 | test ppl 213.09\n", + "| End of training | test loss 4.98 | test ppl 144.89\n", "=========================================================================================\n" ] } @@ -1739,7 +1530,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 33, "id": "848af399", "metadata": {}, "outputs": [], @@ -1765,13 +1556,13 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 157, "id": "cfb30fe0", "metadata": {}, "outputs": [], "source": [ "sample_batch = [\n", - " \"The brain is\",\n", + " \"Hello World\"\n", "]\n", "input_batch = sample_batch" ] @@ -1786,12 +1577,12 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 158, "id": "305853e8", "metadata": {}, "outputs": [], "source": [ - "bptt = 3\n", + "bptt = 2\n", "src_mask = generate_square_subsequent_mask(bptt).to(device)" ] }, @@ -1805,7 +1596,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 159, "id": "afe585d6", "metadata": {}, "outputs": [], @@ -1825,7 +1616,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 154, "id": "8bfaa8bd", "metadata": {}, "outputs": [], @@ -1845,21 +1636,10 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "6e2c35ba", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda')" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "device" @@ -1875,21 +1655,53 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 50, "id": "223eed8a", "metadata": {}, "outputs": [ { - "ename": "RuntimeError", - "evalue": "Error(s) in loading state_dict for TransformerModel:\n\tsize mismatch for encoder.weight: copying a param with shape torch.Size([84399, 200]) from checkpoint, the shape in current model is torch.Size([6526, 200]).\n\tsize mismatch for decoder.weight: copying a param with shape torch.Size([84399, 200]) from checkpoint, the shape in current model is torch.Size([6526, 200]).\n\tsize mismatch for decoder.bias: copying a param with shape torch.Size([84399]) from checkpoint, the shape in current model is torch.Size([6526]).", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [40], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m best_model \u001b[38;5;241m=\u001b[39m TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m----> 2\u001b[0m \u001b[43mbest_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_state_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mautocomplete_model\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1667\u001b[0m, in \u001b[0;36mModule.load_state_dict\u001b[0;34m(self, state_dict, strict)\u001b[0m\n\u001b[1;32m 1662\u001b[0m error_msgs\u001b[38;5;241m.\u001b[39minsert(\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing key(s) in state_dict: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m. \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 1664\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(k) \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m missing_keys)))\n\u001b[1;32m 1666\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(error_msgs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 1667\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError(s) in loading state_dict for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[1;32m 1668\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(error_msgs)))\n\u001b[1;32m 1669\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _IncompatibleKeys(missing_keys, unexpected_keys)\n", - "\u001b[0;31mRuntimeError\u001b[0m: Error(s) in loading state_dict for TransformerModel:\n\tsize mismatch for encoder.weight: copying a param with shape torch.Size([84399, 200]) from checkpoint, the shape in current model is torch.Size([6526, 200]).\n\tsize mismatch for decoder.weight: copying a param with shape torch.Size([84399, 200]) from checkpoint, the shape in current model is torch.Size([6526, 200]).\n\tsize mismatch for decoder.bias: copying a param with shape torch.Size([84399]) from checkpoint, the shape in current model is torch.Size([6526])." - ] + "data": { + "text/plain": [ + "TransformerModel(\n", + " (pos_encoder): PositionalEncoding(\n", + " (dropout): Dropout(p=0.2, inplace=False)\n", + " )\n", + " (transformer_encoder): TransformerEncoder(\n", + " (layers): ModuleList(\n", + " (0): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)\n", + " )\n", + " (linear1): Linear(in_features=200, out_features=200, bias=True)\n", + " (dropout): Dropout(p=0.2, inplace=False)\n", + " (linear2): Linear(in_features=200, out_features=200, bias=True)\n", + " (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)\n", + " (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)\n", + " (dropout1): Dropout(p=0.2, inplace=False)\n", + " (dropout2): Dropout(p=0.2, inplace=False)\n", + " )\n", + " (1): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)\n", + " )\n", + " (linear1): Linear(in_features=200, out_features=200, bias=True)\n", + " (dropout): Dropout(p=0.2, inplace=False)\n", + " (linear2): Linear(in_features=200, out_features=200, bias=True)\n", + " (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)\n", + " (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)\n", + " (dropout1): Dropout(p=0.2, inplace=False)\n", + " (dropout2): Dropout(p=0.2, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (encoder): Embedding(163987, 200)\n", + " (decoder): Linear(in_features=200, out_features=163987, bias=True)\n", + ")" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1907,26 +1719,20 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 160, "id": "64223e87", "metadata": {}, "outputs": [], "source": [ "def predict(input_line, mask, n_predictions=3):\n", - " print('\\n> %s' % input_line)\n", " with torch.no_grad():\n", - " output = best_model(input_line.to(device), mask)\n", - "\n", - " # Get top N categories\n", - " topv, topi = output.topk(n_predictions, 1, True)\n", - "\n", + " output = best_model(input_line.to(device), mask) \n", " predictions = []\n", " for i in range(n_predictions):\n", - " value = topv[0][i]\n", - " v1, v2 = value.topk(1)\n", - " predict_token_index = v2.cpu().detach().numpy()\n", - " print(\"predict token index: \", predict_token_index)\n", + " next_item = output.topk(i+1)[1].view(-1)[-1].item()\n", + " predict_token_index = next_item\n", " predictions.append(vocab.lookup_token(predict_token_index))\n", + " \n", " return predictions" ] }, @@ -1940,7 +1746,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 161, "id": "b2895698", "metadata": {}, "outputs": [], @@ -1950,19 +1756,17 @@ " is_terminated = False\n", " input_batch = sample_batch\n", " while(not is_terminated):\n", - " # 2*count is need because spaces count aswell\n", " mask_size = bptt+(iteration) \n", " src_mask = generate_square_subsequent_mask(mask_size).to(device)\n", " data = toDataTensor(input_batch)\n", " \n", " for i, d in enumerate(data):\n", " predictions = predict(d, src_mask, num_of_pred)\n", - " print(\"Current input:\", i)\n", - " print(input_batch[i])\n", - " print(\"Possible continuations:\")\n", + " \n", + " print(\"\\n Possible continuations:\")\n", " for j in range(len(predictions)):\n", " print(j + 1, \": \", predictions[j])\n", - " s_index = input(\"Choose continuation by index:\")\n", + " s_index = input(input_batch[i])\n", " if(\"e\" in s_index):\n", " is_terminated = True\n", " print(\"prediction stopped.\")\n", @@ -1977,7 +1781,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "id": "13ed9298", "metadata": {}, "outputs": [ @@ -1985,157 +1789,87 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "> tensor([ 3, 161, 18])\n", - "predict token index: [2]\n", - "predict token index: [5]\n", - "predict token index: [3]\n", - "Current input: 0\n", - "The brain is\n", "Possible continuations:\n", - "1 : ,\n", - "2 : of\n", - "3 : the\n", - "Choose continuation by index:3\n", + "1 : health\n", + "2 : .\n", + "3 : ,\n", + "Hello World2\n", "Text is now:\n", - "The brain is the\n", - "\n", - "> tensor([ 3, 374, 18])\n", - "predict token index: [2]\n", - "predict token index: [5]\n", - "predict token index: [183]\n", - "Current input: 1\n", - "The lung is\n", + "Hello World .\n", "Possible continuations:\n", - "1 : ,\n", - "2 : of\n", - "3 : identified\n", - "Choose continuation by index:3\n", + "1 : the\n", + "2 : in\n", + "3 : this\n", + "Hello World .2\n", "Text is now:\n", - "The lung is identified\n", - "\n", - "> tensor([ 3, 161, 18, 3])\n", - "predict token index: [2]\n", - "predict token index: [5]\n", - "predict token index: [132]\n", - "Current input: 0\n", - "The brain is the\n", + "Hello World . in\n", "Possible continuations:\n", - "1 : ,\n", - "2 : of\n", - "3 : most\n", - "Choose continuation by index:3\n", + "1 : the\n", + "2 : a\n", + "3 : blood\n", + "Hello World . in1\n", "Text is now:\n", - "The brain is the most\n", - "\n", - "> tensor([ 3, 374, 18, 183])\n", - "predict token index: [2]\n", - "predict token index: [5]\n", - "predict token index: [8]\n", - "Current input: 1\n", - "The lung is identified\n", + "Hello World . in the\n", "Possible continuations:\n", - "1 : ,\n", - "2 : of\n", - "3 : in\n", - "Choose continuation by index:1\n", + "1 : blood\n", + "2 : effect\n", + "3 : same\n", + "Hello World . in the1\n", "Text is now:\n", - "The lung is identified ,\n", - "\n", - "> tensor([ 3, 161, 18, 3, 132])\n", - "predict token index: [258]\n", - "predict token index: [5]\n", - "predict token index: [5]\n", - "Current input: 0\n", - "The brain is the most\n", + "Hello World . in the blood\n", "Possible continuations:\n", - "1 : common\n", - "2 : of\n", - "3 : of\n", - "Choose continuation by index:1\n", + "1 : flow\n", + "2 : pressure\n", + "3 : vessels\n", + "Hello World . in the blood2\n", "Text is now:\n", - "The brain is the most common\n", - "\n", - "> tensor([ 3, 374, 18, 183, 2])\n", - "predict token index: [4]\n", - "predict token index: [4]\n", - "predict token index: [3]\n", - "Current input: 1\n", - "The lung is identified ,\n", + "Hello World . in the blood pressure\n", "Possible continuations:\n", "1 : and\n", - "2 : and\n", - "3 : the\n", - "Choose continuation by index:3\n", + "2 : (\n", + "3 : ,\n", + "Hello World . in the blood pressure1\n", "Text is now:\n", - "The lung is identified , the\n", - "\n", - "> tensor([ 3, 161, 18, 3, 132, 258])\n", - "predict token index: [258]\n", - "predict token index: [1]\n", - "predict token index: [5]\n", - "Current input: 0\n", - "The brain is the most common\n", + "Hello World . in the blood pressure and\n", "Possible continuations:\n", - "1 : common\n", - "2 : .\n", - "3 : of\n", - "Choose continuation by index:3\n", + "1 : the\n", + "2 : in\n", + "3 : a\n", + "Hello World . in the blood pressure and1\n", "Text is now:\n", - "The brain is the most common of\n", - "\n", - "> tensor([ 3, 374, 18, 183, 2, 3])\n", - "predict token index: [4]\n", - "predict token index: [4]\n", - "predict token index: [3]\n", - "Current input: 1\n", - "The lung is identified , the\n", + "Hello World . in the blood pressure and the\n", "Possible continuations:\n", - "1 : and\n", - "2 : and\n", - "3 : the\n", - "Choose continuation by index:1\n", + "1 : blood\n", + "2 : effect\n", + "3 : same\n", + "Hello World . in the blood pressure and the1\n", "Text is now:\n", - "The lung is identified , the and\n", - "\n", - "> tensor([ 3, 161, 18, 3, 132, 258, 5])\n", - "predict token index: [258]\n", - "predict token index: [1]\n", - "predict token index: [5]\n", - "Current input: 0\n", - "The brain is the most common of\n", + "Hello World . in the blood pressure and the blood\n", "Possible continuations:\n", - "1 : common\n", - "2 : .\n", - "3 : of\n", - "Choose continuation by index:1\n", + "1 : flow\n", + "2 : pressure\n", + "3 : vessels\n", + "Hello World . in the blood pressure and the blood3\n", "Text is now:\n", - "The brain is the most common of common\n", - "\n", - "> tensor([ 3, 374, 18, 183, 2, 3, 4])\n", - "predict token index: [4]\n", - "predict token index: [4]\n", - "predict token index: [3]\n", - "Current input: 1\n", - "The lung is identified , the and\n", + "Hello World . in the blood pressure and the blood vessels\n", "Possible continuations:\n", - "1 : and\n", - "2 : and\n", - "3 : the\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "Interrupted by user", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [78], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpredict_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn [77], line 17\u001b[0m, in \u001b[0;36mpredict_loop\u001b[0;34m(num_of_pred)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m j \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(predictions)):\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(j \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m, predictions[j])\n\u001b[0;32m---> 17\u001b[0m s_index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mChoose continuation by index:\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124me\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m s_index):\n\u001b[1;32m 19\u001b[0m is_terminated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/ipykernel/kernelbase.py:1177\u001b[0m, in \u001b[0;36mKernel.raw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 1173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_allow_stdin:\n\u001b[1;32m 1174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(\n\u001b[1;32m 1175\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1176\u001b[0m )\n\u001b[0;32m-> 1177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1178\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1179\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parent_ident\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1180\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1181\u001b[0m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1182\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/lib/python3.10/site-packages/ipykernel/kernelbase.py:1219\u001b[0m, in \u001b[0;36mKernel._input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1217\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1218\u001b[0m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[0;32m-> 1219\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInterrupted by user\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 1220\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1221\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid Message:\u001b[39m\u001b[38;5;124m\"\u001b[39m, exc_info\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: Interrupted by user" + "1 : .\n", + "2 : of\n", + "3 : ,\n", + "Hello World . in the blood pressure and the blood vessels2\n", + "Text is now:\n", + "Hello World . in the blood pressure and the blood vessels of\n", + "Possible continuations:\n", + "1 : the\n", + "2 : blood\n", + "3 : a\n", + "Hello World . in the blood pressure and the blood vessels of1\n", + "Text is now:\n", + "Hello World . in the blood pressure and the blood vessels of the\n", + "Possible continuations:\n", + "1 : blood\n", + "2 : effect\n", + "3 : same\n" ] } ],