|
|
@ -18,6 +18,156 @@ |
|
|
|
"- Shahein Enjjar\t\n", |
|
|
|
"- Leonard Starke" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "ee9c1d92", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### Firstly try to import the data modules" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 2, |
|
|
|
"id": "e444b44c", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"Defaulting to user installation because normal site-packages is not writeable\n", |
|
|
|
"Collecting Bio\n", |
|
|
|
" Downloading bio-1.4.0-py3-none-any.whl (270 kB)\n", |
|
|
|
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", |
|
|
|
"\u001b[?25hCollecting mygene\n", |
|
|
|
" Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)\n", |
|
|
|
"Collecting biopython>=1.79\n", |
|
|
|
" Downloading biopython-1.79-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", |
|
|
|
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", |
|
|
|
"\u001b[?25hRequirement already satisfied: requests in /usr/lib/python3.10/site-packages (from Bio) (2.28.1)\n", |
|
|
|
"Collecting tqdm\n", |
|
|
|
" Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n", |
|
|
|
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n", |
|
|
|
"\u001b[?25hRequirement already satisfied: numpy in /usr/lib/python3.10/site-packages (from biopython>=1.79->Bio) (1.23.3)\n", |
|
|
|
"Collecting biothings-client>=0.2.6\n", |
|
|
|
" Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)\n", |
|
|
|
"Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3.10/site-packages (from requests->Bio) (3.4)\n", |
|
|
|
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3.10/site-packages (from requests->Bio) (1.26.12)\n", |
|
|
|
"Installing collected packages: tqdm, biopython, biothings-client, mygene, Bio\n", |
|
|
|
"\u001b[33m WARNING: The script tqdm is installed in '/home/hein/.local/bin' which is not on PATH.\n", |
|
|
|
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", |
|
|
|
"\u001b[0m\u001b[33m WARNING: The scripts bio and fasta_filter.py are installed in '/home/hein/.local/bin' which is not on PATH.\n", |
|
|
|
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n", |
|
|
|
"\u001b[0mSuccessfully installed Bio-1.4.0 biopython-1.79 biothings-client-0.2.6 mygene-3.2.2 tqdm-4.64.1\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"try:\n", |
|
|
|
" from Bio import Entrez, Medline \n", |
|
|
|
"except:\n", |
|
|
|
" !pip install Bio\n", |
|
|
|
" from Bio import Entrez, Medline \n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "7bf15c30", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### define function for loading the papers from PubMed database" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 7, |
|
|
|
"id": "adfb256a", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"def getPapers(myQuery, maxPapers, myEmail =\"xxxxx@xxxxxxxx.xx\"):\n", |
|
|
|
" # Get articles from PubMed\n", |
|
|
|
" Entrez.email =myEmail\n", |
|
|
|
" record =Entrez.read(Entrez.esearch(db=\"pubmed\", term=myQuery, retmax=maxPapers))\n", |
|
|
|
" idlist = record[\"IdList\"]\n", |
|
|
|
" print(\"\\nThere are %d records for %s.\"%(len(idlist), myQuery.strip()))\n", |
|
|
|
" records = Medline.parse(Entrez.efetch(db=\"pubmed\", id=idlist, rettype=\"medline\", retmode=\"text\"))\n", |
|
|
|
" return list(records)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "46bc6298", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### Verify that its working" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 9, |
|
|
|
"id": "00481ec9", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"\n", |
|
|
|
"There are 1000 records for Cancer[tiab].\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"myQuery =\"Cancer\"+\"[tiab]\" #query in title and abstract\n", |
|
|
|
"maxPapers = 1000 # thinkabout outsourcing params to seperate section\n", |
|
|
|
"records = getPapers(myQuery, maxPapers)\n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "b67747c6", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### Now extract abstracts from records" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 30, |
|
|
|
"id": "dcf5c217", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"r_abstracts = []\n", |
|
|
|
"for r in records:\n", |
|
|
|
" r_abstracts.append(r)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"id": "e309f6fe", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"### Now import torch modules needed to load the data" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 11, |
|
|
|
"id": "c3199444", |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"try:\n", |
|
|
|
" import torch\n", |
|
|
|
" from torch.utils.data import Dataset \n", |
|
|
|
"except:\n", |
|
|
|
" !pip install pytorch\n", |
|
|
|
" \n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|