From f65604709cba682f6b8eea0e3c662fd4c760a045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Constantin=20F=C3=BCrst?= Date: Wed, 18 Jan 2023 13:56:46 +0100 Subject: [PATCH] read pubmed data from formatted text file on cloud storage, retrieved by esearch/efetch-command-line-tools --- AutomaticSentenceCompletion.ipynb | 117 ++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 31 deletions(-) diff --git a/AutomaticSentenceCompletion.ipynb b/AutomaticSentenceCompletion.ipynb index 70ddfe1..1efb201 100644 --- a/AutomaticSentenceCompletion.ipynb +++ b/AutomaticSentenceCompletion.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "id": "e444b44c", "metadata": {}, "outputs": [], @@ -41,6 +41,35 @@ " from Bio import Entrez, Medline \n" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3209935b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-01-18 14:27:45-- https://cloud.constantin-fuerst.com/s/944x5BpTQM7GjtF/download\n", + "Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n", + "Resolving cloud.constantin-fuerst.com (cloud.constantin-fuerst.com)... 95.91.21.14\n", + "Connecting to cloud.constantin-fuerst.com (cloud.constantin-fuerst.com)|95.91.21.14|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1100551 (1.0M) [text/plain]\n", + "Saving to: ‘pubmed-query.txt’\n", + "\n", + "pubmed-query.txt 100%[===================>] 1.05M 1.91MB/s in 0.6s \n", + "\n", + "2023-01-18 14:27:47 (1.91 MB/s) - ‘pubmed-query.txt’ saved [1100551/1100551]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://cloud.constantin-fuerst.com/s/944x5BpTQM7GjtF/download -O pubmed-query.txt" + ] + }, { "cell_type": "markdown", "id": "7bf15c30", @@ -51,18 +80,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 5, "id": "adfb256a", "metadata": {}, "outputs": [], "source": [ - "def getPapers(myQuery, maxPapers, myEmail =\"leonard.starke@mailbox.tu-dresden.de\"):\n", - " # Get articles from PubMed\n", - " Entrez.email = myEmail\n", - " record = Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n", - " idlist = record[\"IdList\"]\n", - " print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n", - " records = Medline.parse(Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\"))\n", + "def getPapers(filename):\n", + " pubmed_query = open(filename, encoding='utf-8')\n", + " records = Medline.parse(pubmed_query)\n", " return list(records)" ] }, @@ -76,17 +101,7 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "39c3b352", - "metadata": {}, - "outputs": [], - "source": [ - "amountOfPapers = 100000" - ] - }, - { - "cell_type": "code", - "execution_count": 27, + "execution_count": 6, "id": "00481ec9", "metadata": {}, "outputs": [ @@ -94,15 +109,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", - "There are 9999 records for Blood [tiab].\n" + "Got 290 records from the query text file\n" ] } ], "source": [ - "myQuery =\"Blood [tiab]\" #query in title and abstract\n", - "maxPapers = amountOfPapers\n", - "records = getPapers(myQuery, maxPapers)" + "records = getPapers(\"pubmed-query.txt\")\n", + "print(f\"Got {len(records)} records from the query text file\")" ] }, { @@ -115,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 7, "id": "dcf5c217", "metadata": {}, "outputs": [], @@ -177,7 +190,7 @@ }, { "cell_type": "markdown", - "id": "ec1db50b", + "id": "683ed2fc", "metadata": {}, "source": [ "### import math module" @@ -186,7 +199,7 @@ { "cell_type": "code", "execution_count": 33, - "id": "eb32bd79", + "id": "8d2312db", "metadata": {}, "outputs": [], "source": [ @@ -1917,7 +1930,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "id": "b2895698", "metadata": {}, "outputs": [], @@ -1945,7 +1958,7 @@ " break\n", "\n", " print(\"Text is now:\")\n", - " input_batch[i] += (\" \" + predictions[int(s_index) -1])\n", + " input_batch[i] += (\" \" + predictions[int(s_index) - 1])\n", " print(input_batch[i])\n", "\n", " iteration = iteration + 1" @@ -1953,7 +1966,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "id": "13ed9298", "metadata": {}, "outputs": [ @@ -2069,8 +2082,50 @@ "Possible continuations:\n", "1 : and\n", "2 : and\n", + "3 : the\n", + "Choose continuation by index:1\n", + "Text is now:\n", + "The lung is identified , the and\n", + "\n", + "> tensor([ 3, 161, 18, 3, 132, 258, 5])\n", + "predict token index: [258]\n", + "predict token index: [1]\n", + "predict token index: [5]\n", + "Current input: 0\n", + "The brain is the most common of\n", + "Possible continuations:\n", + "1 : common\n", + "2 : .\n", + "3 : of\n", + "Choose continuation by index:1\n", + "Text is now:\n", + "The brain is the most common of common\n", + "\n", + "> tensor([ 3, 374, 18, 183, 2, 3, 4])\n", + "predict token index: [4]\n", + "predict token index: [4]\n", + "predict token index: [3]\n", + "Current input: 1\n", + "The lung is identified , the and\n", + "Possible continuations:\n", + "1 : and\n", + "2 : and\n", "3 : the\n" ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "Interrupted by user", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [78], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpredict_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn [77], line 17\u001b[0m, in \u001b[0;36mpredict_loop\u001b[0;34m(num_of_pred)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m j \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(predictions)):\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(j \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m, predictions[j])\n\u001b[0;32m---> 17\u001b[0m s_index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mChoose continuation by index:\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124me\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m s_index):\n\u001b[1;32m 19\u001b[0m is_terminated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m/usr/lib/python3.10/site-packages/ipykernel/kernelbase.py:1177\u001b[0m, in \u001b[0;36mKernel.raw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 1173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_allow_stdin:\n\u001b[1;32m 1174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(\n\u001b[1;32m 1175\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1176\u001b[0m )\n\u001b[0;32m-> 1177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1178\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1179\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parent_ident\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1180\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1181\u001b[0m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1182\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/lib/python3.10/site-packages/ipykernel/kernelbase.py:1219\u001b[0m, in \u001b[0;36mKernel._input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1217\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1218\u001b[0m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[0;32m-> 1219\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInterrupted by user\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 1220\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1221\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid Message:\u001b[39m\u001b[38;5;124m\"\u001b[39m, exc_info\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: Interrupted by user" + ] } ], "source": [