parse abstracts from PubMedDB and store them in local python var

2 years ago · b910acbb7a
1 changed files with 150 additions and 0 deletions
--- a/AutomaticSentenceCompletion.ipynb
+++ b/AutomaticSentenceCompletion.ipynb
@ -18,6 +18,156 @@
    "- Shahein Enjjar\t\n",
    "- Leonard Starke"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee9c1d92",
+   "metadata": {},
+   "source": [
+    "### First, try to import the data modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e444b44c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Defaulting to user installation because normal site-packages is not writeable\n",
+      "Collecting Bio\n",
+      "  Downloading bio-1.4.0-py3-none-any.whl (270 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hCollecting mygene\n",
+      "  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)\n",
+      "Collecting biopython>=1.79\n",
+      "  Downloading biopython-1.79-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: requests in /usr/lib/python3.10/site-packages (from Bio) (2.28.1)\n",
+      "Collecting tqdm\n",
+      "  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: numpy in /usr/lib/python3.10/site-packages (from biopython>=1.79->Bio) (1.23.3)\n",
+      "Collecting biothings-client>=0.2.6\n",
+      "  Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3.10/site-packages (from requests->Bio) (3.4)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3.10/site-packages (from requests->Bio) (1.26.12)\n",
+      "Installing collected packages: tqdm, biopython, biothings-client, mygene, Bio\n",
+      "\u001b[33m  WARNING: The script tqdm is installed in '/home/hein/.local/bin' which is not on PATH.\n",
+      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
+      "\u001b[0m\u001b[33m  WARNING: The scripts bio and fasta_filter.py are installed in '/home/hein/.local/bin' which is not on PATH.\n",
+      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
+      "\u001b[0mSuccessfully installed Bio-1.4.0 biopython-1.79 biothings-client-0.2.6 mygene-3.2.2 tqdm-4.64.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The `Bio` namespace (Entrez client, Medline parser) is shipped by Biopython.\n",
+    "try:\n",
+    "    from Bio import Entrez, Medline\n",
+    "except ImportError:\n",
+    "    # Install into the running kernel's environment only when the import\n",
+    "    # itself fails; any other error is left to surface unchanged.\n",
+    "    %pip install biopython\n",
+    "    from Bio import Entrez, Medline\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7bf15c30",
+   "metadata": {},
+   "source": [
+    "### Define a function for loading the papers from the PubMed database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "adfb256a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getPapers(myQuery, maxPapers, myEmail=\"xxxxx@xxxxxxxx.xx\"):\n",
+    "    \"\"\"Search PubMed and return the matching records in MEDLINE format.\n",
+    "\n",
+    "    Args:\n",
+    "        myQuery: Entrez search term, e.g. `Cancer[tiab]`.\n",
+    "        maxPapers: Maximum number of PubMed IDs to request (sent as `retmax`).\n",
+    "        myEmail: Address assigned to `Entrez.email` before the requests are made.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of parsed Medline records, one per fetched article. The list is\n",
+    "        empty when the search yields no IDs.\n",
+    "    \"\"\"\n",
+    "    Entrez.email = myEmail\n",
+    "\n",
+    "    # Step 1: look up the PubMed IDs that match the query.\n",
+    "    search_handle = Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers)\n",
+    "    try:\n",
+    "        search_result = Entrez.read(search_handle)\n",
+    "    finally:\n",
+    "        search_handle.close()\n",
+    "    idlist = search_result[\"IdList\"]\n",
+    "    print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n",
+    "\n",
+    "    if not idlist:\n",
+    "        # Nothing matched, so skip the efetch request for an empty ID list.\n",
+    "        return []\n",
+    "\n",
+    "    # Step 2: download the records for those IDs as MEDLINE text.\n",
+    "    fetch_handle = Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\")\n",
+    "    try:\n",
+    "        # Materialize the records before closing, since parsing reads from the handle.\n",
+    "        return list(Medline.parse(fetch_handle))\n",
+    "    finally:\n",
+    "        fetch_handle.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46bc6298",
+   "metadata": {},
+   "source": [
+    "### Verify that it's working"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "00481ec9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "There are 1000 records for Cancer[tiab].\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Search term: the [tiab] tag restricts the query to title and abstract.\n",
+    "myQuery = \"Cancer[tiab]\"\n",
+    "# Upper bound on requested papers (TODO: think about moving parameters to a separate section).\n",
+    "maxPapers = 1000\n",
+    "records = getPapers(myQuery=myQuery, maxPapers=maxPapers)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b67747c6",
+   "metadata": {},
+   "source": [
+    "### Now extract abstracts from records"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "dcf5c217",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Medline records are dict-like and keyed by MEDLINE field tags; \"AB\" is the\n",
+    "# abstract text. Records that carry no abstract are skipped.\n",
+    "r_abstracts = [r[\"AB\"] for r in records if \"AB\" in r]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e309f6fe",
+   "metadata": {},
+   "source": [
+    "### Now import torch modules needed to load the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c3199444",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    import torch\n",
+    "    from torch.utils.data import Dataset\n",
+    "except ImportError:\n",
+    "    # PyTorch is published on PyPI as `torch`; `pytorch` is not the real package.\n",
+    "    %pip install torch\n",
+    "    # Import again so the names exist in this run after the install.\n",
+    "    import torch\n",
+    "    from torch.utils.data import Dataset\n"
+   ]
  }
 ],
 "metadata": {