Browse Source

parse abstracts from PubMedDB and store them in local python var

dev_neuralnet
Leonard Starke 2 years ago
parent
commit
b910acbb7a
  1. 150
      AutomaticSentenceCompletion.ipynb

150
AutomaticSentenceCompletion.ipynb

@ -18,6 +18,156 @@
"- Shahein Enjjar\t\n",
"- Leonard Starke"
]
},
{
"cell_type": "markdown",
"id": "ee9c1d92",
"metadata": {},
"source": [
"### Firstly try to import the data modules"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e444b44c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting Bio\n",
" Downloading bio-1.4.0-py3-none-any.whl (270 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting mygene\n",
" Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)\n",
"Collecting biopython>=1.79\n",
" Downloading biopython-1.79-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: requests in /usr/lib/python3.10/site-packages (from Bio) (2.28.1)\n",
"Collecting tqdm\n",
" Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy in /usr/lib/python3.10/site-packages (from biopython>=1.79->Bio) (1.23.3)\n",
"Collecting biothings-client>=0.2.6\n",
" Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3.10/site-packages (from requests->Bio) (3.4)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3.10/site-packages (from requests->Bio) (1.26.12)\n",
"Installing collected packages: tqdm, biopython, biothings-client, mygene, Bio\n",
"\u001b[33m WARNING: The script tqdm is installed in '/home/hein/.local/bin' which is not on PATH.\n",
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
"\u001b[0m\u001b[33m WARNING: The scripts bio and fasta_filter.py are installed in '/home/hein/.local/bin' which is not on PATH.\n",
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
"\u001b[0mSuccessfully installed Bio-1.4.0 biopython-1.79 biothings-client-0.2.6 mygene-3.2.2 tqdm-4.64.1\n"
]
}
],
"source": [
"try:\n",
" from Bio import Entrez, Medline \n",
"except:\n",
" !pip install Bio\n",
" from Bio import Entrez, Medline \n"
]
},
{
"cell_type": "markdown",
"id": "7bf15c30",
"metadata": {},
"source": [
"### define function for loading the papers from PubMed database"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "adfb256a",
"metadata": {},
"outputs": [],
"source": [
"def getPapers(myQuery, maxPapers, myEmail =\"xxxxx@xxxxxxxx.xx\"):\n",
" # Get articles from PubMed\n",
" Entrez.email =myEmail\n",
" record =Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n",
" idlist = record[\"IdList\"]\n",
" print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n",
" records = Medline.parse(Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\"))\n",
" return list(records)"
]
},
{
"cell_type": "markdown",
"id": "46bc6298",
"metadata": {},
"source": [
"### Verify that its working"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "00481ec9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"There are 1000 records for Cancer[tiab].\n"
]
}
],
"source": [
"myQuery =\"Cancer\"+\"[tiab]\" #query in title and abstract\n",
"maxPapers = 1000 # thinkabout outsourcing params to seperate section\n",
"records = getPapers(myQuery, maxPapers)\n"
]
},
{
"cell_type": "markdown",
"id": "b67747c6",
"metadata": {},
"source": [
"### Now extract abstracts from records"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "dcf5c217",
"metadata": {},
"outputs": [],
"source": [
"r_abstracts = []\n",
"for r in records:\n",
" r_abstracts.append(r)"
]
},
{
"cell_type": "markdown",
"id": "e309f6fe",
"metadata": {},
"source": [
"### Now import torch modules needed to load the data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c3199444",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import torch\n",
" from torch.utils.data import Dataset \n",
"except:\n",
" !pip install pytorch\n",
" \n"
]
}
],
"metadata": {

Loading…
Cancel
Save