Browse Source

read pubmed data from formatted text file on cloud storage, retrieved by esearch/efetch-command-line-tools

dev_neuralnet
Constantin Fürst 2 years ago
parent
commit
f65604709c
  1. 117
      AutomaticSentenceCompletion.ipynb

117
AutomaticSentenceCompletion.ipynb

@ -29,7 +29,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25,
"execution_count": 1,
"id": "e444b44c", "id": "e444b44c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -41,6 +41,35 @@
" from Bio import Entrez, Medline \n" " from Bio import Entrez, Medline \n"
] ]
}, },
{
"cell_type": "code",
"execution_count": 2,
"id": "3209935b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2023-01-18 14:27:45-- https://cloud.constantin-fuerst.com/s/944x5BpTQM7GjtF/download\n",
"Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'\n",
"Resolving cloud.constantin-fuerst.com (cloud.constantin-fuerst.com)... 95.91.21.14\n",
"Connecting to cloud.constantin-fuerst.com (cloud.constantin-fuerst.com)|95.91.21.14|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1100551 (1.0M) [text/plain]\n",
"Saving to: ‘pubmed-query.txt’\n",
"\n",
"pubmed-query.txt 100%[===================>] 1.05M 1.91MB/s in 0.6s \n",
"\n",
"2023-01-18 14:27:47 (1.91 MB/s) - ‘pubmed-query.txt’ saved [1100551/1100551]\n",
"\n"
]
}
],
"source": [
"!wget https://cloud.constantin-fuerst.com/s/944x5BpTQM7GjtF/download -O pubmed-query.txt"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "7bf15c30", "id": "7bf15c30",
@ -51,18 +80,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26,
"execution_count": 5,
"id": "adfb256a", "id": "adfb256a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def getPapers(myQuery, maxPapers, myEmail =\"leonard.starke@mailbox.tu-dresden.de\"):\n",
" # Get articles from PubMed\n",
" Entrez.email = myEmail\n",
" record = Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n",
" idlist = record[\"IdList\"]\n",
" print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n",
" records = Medline.parse(Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\"))\n",
"def getPapers(filename):\n",
"    \"\"\"Parse the MEDLINE-formatted text file `filename` and return its records as a list.\"\"\"\n",
"    # Build the list inside the with-block: the parser reads from the open\n",
"    # handle while it is iterated, and the handle is closed once we are done\n",
"    # (previously the file was opened and never closed).\n",
"    with open(filename, encoding='utf-8') as pubmed_query:\n",
"        return list(Medline.parse(pubmed_query))"
] ]
}, },
@ -76,17 +101,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4,
"id": "39c3b352",
"metadata": {},
"outputs": [],
"source": [
"amountOfPapers = 100000"
]
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 6,
"id": "00481ec9", "id": "00481ec9",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -94,15 +109,13 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\n",
"There are 9999 records for Blood [tiab].\n"
"Got 290 records from the query text file\n"
] ]
} }
], ],
"source": [ "source": [
"myQuery =\"Blood [tiab]\" #query in title and abstract\n",
"maxPapers = amountOfPapers\n",
"records = getPapers(myQuery, maxPapers)"
"records = getPapers(\"pubmed-query.txt\")\n",
"print(f\"Got {len(records)} records from the query text file\")"
] ]
}, },
{ {
@ -115,7 +128,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28,
"execution_count": 7,
"id": "dcf5c217", "id": "dcf5c217",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -177,7 +190,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "ec1db50b",
"id": "683ed2fc",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### import math module" "### import math module"
@ -186,7 +199,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 33,
"id": "eb32bd79",
"id": "8d2312db",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1917,7 +1930,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 77,
"execution_count": null,
"id": "b2895698", "id": "b2895698",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -1945,7 +1958,7 @@
" break\n", " break\n",
"\n", "\n",
" print(\"Text is now:\")\n", " print(\"Text is now:\")\n",
" input_batch[i] += (\" \" + predictions[int(s_index) -1])\n",
" input_batch[i] += (\" \" + predictions[int(s_index) - 1])\n",
" print(input_batch[i])\n", " print(input_batch[i])\n",
"\n", "\n",
" iteration = iteration + 1" " iteration = iteration + 1"
@ -1953,7 +1966,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"execution_count": 78,
"id": "13ed9298", "id": "13ed9298",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -2069,8 +2082,50 @@
"Possible continuations:\n", "Possible continuations:\n",
"1 : and\n", "1 : and\n",
"2 : and\n", "2 : and\n",
"3 : the\n",
"Choose continuation by index:1\n",
"Text is now:\n",
"The lung is identified , the and\n",
"\n",
"> tensor([ 3, 161, 18, 3, 132, 258, 5])\n",
"predict token index: [258]\n",
"predict token index: [1]\n",
"predict token index: [5]\n",
"Current input: 0\n",
"The brain is the most common of\n",
"Possible continuations:\n",
"1 : common\n",
"2 : .\n",
"3 : of\n",
"Choose continuation by index:1\n",
"Text is now:\n",
"The brain is the most common of common\n",
"\n",
"> tensor([ 3, 374, 18, 183, 2, 3, 4])\n",
"predict token index: [4]\n",
"predict token index: [4]\n",
"predict token index: [3]\n",
"Current input: 1\n",
"The lung is identified , the and\n",
"Possible continuations:\n",
"1 : and\n",
"2 : and\n",
"3 : the\n" "3 : the\n"
] ]
},
{
"ename": "KeyboardInterrupt",
"evalue": "Interrupted by user",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn [78], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpredict_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn [77], line 17\u001b[0m, in \u001b[0;36mpredict_loop\u001b[0;34m(num_of_pred)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m j \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(predictions)):\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28mprint\u001b[39m(j \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;124m\"\u001b[39m, predictions[j])\n\u001b[0;32m---> 17\u001b[0m s_index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mChoose continuation by index:\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124me\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m s_index):\n\u001b[1;32m 19\u001b[0m is_terminated \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/ipykernel/kernelbase.py:1177\u001b[0m, in \u001b[0;36mKernel.raw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 1173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_allow_stdin:\n\u001b[1;32m 1174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(\n\u001b[1;32m 1175\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1176\u001b[0m )\n\u001b[0;32m-> 1177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1178\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1179\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parent_ident\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1180\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1181\u001b[0m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1182\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/usr/lib/python3.10/site-packages/ipykernel/kernelbase.py:1219\u001b[0m, in \u001b[0;36mKernel._input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 1216\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 1217\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1218\u001b[0m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[0;32m-> 1219\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInterrupted by user\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 1220\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1221\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid Message:\u001b[39m\u001b[38;5;124m\"\u001b[39m, exc_info\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: Interrupted by user"
]
} }
], ],
"source": [ "source": [

Loading…
Cancel
Save