diff --git a/AutomaticSentenceCompletion.ipynb b/AutomaticSentenceCompletion.ipynb index 6a1082d..7334bc3 100644 --- a/AutomaticSentenceCompletion.ipynb +++ b/AutomaticSentenceCompletion.ipynb @@ -18,6 +18,156 @@ "- Shahein Enjjar\t\n", "- Leonard Starke" ] + }, + { + "cell_type": "markdown", + "id": "ee9c1d92", + "metadata": {}, + "source": [ + "### Firstly try to import the data modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e444b44c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Collecting Bio\n", + " Downloading bio-1.4.0-py3-none-any.whl (270 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m270.9/270.9 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting mygene\n", + " Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)\n", + "Collecting biopython>=1.79\n", + " Downloading biopython-1.79-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests in /usr/lib/python3.10/site-packages (from Bio) (2.28.1)\n", + "Collecting tqdm\n", + " Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/lib/python3.10/site-packages (from biopython>=1.79->Bio) (1.23.3)\n", + "Collecting biothings-client>=0.2.6\n", + " Downloading biothings_client-0.2.6-py2.py3-none-any.whl (37 kB)\n", 
# %% Import the PubMed access modules (installing Biopython on first use)
try:
    from Bio import Entrez, Medline
except ImportError:
    # Only a missing package should trigger an install; any other error must surface.
    import importlib
    import subprocess
    import sys

    # The `Bio` namespace is provided by the `biopython` distribution.
    # Installing via `sys.executable -m pip` targets the interpreter running this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "biopython"])
    importlib.invalidate_caches()  # make the freshly installed package importable
    from Bio import Entrez, Medline


# %% Define the function for loading papers from the PubMed database
def getPapers(myQuery, maxPapers, myEmail="xxxxx@xxxxxxxx.xx"):
    """Search PubMed and return the matching records parsed from MEDLINE text.

    Args:
        myQuery: Entrez search term, e.g. ``"Cancer[tiab]"``.
        maxPapers: Maximum number of PubMed IDs to request (passed as ``retmax``).
        myEmail: Contact address assigned to ``Entrez.email`` before querying.

    Returns:
        A list of parsed MEDLINE records, one per ID returned by the search.
        The list is empty when the search matches nothing.
    """
    Entrez.email = myEmail

    # Step 1: resolve the query to a list of PubMed IDs.
    search_handle = Entrez.esearch(db="pubmed", term=myQuery, retmax=maxPapers)
    try:
        record = Entrez.read(search_handle)
    finally:
        search_handle.close()
    idlist = record["IdList"]
    print("\nThere are %d records for %s." % (len(idlist), myQuery.strip()))

    # Nothing to download: skip the fetch instead of sending an empty ID list.
    if not idlist:
        return []

    # Step 2: download the records. Medline.parse is lazy, so the records are
    # materialised into a list before the handle is closed.
    fetch_handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    try:
        return list(Medline.parse(fetch_handle))
    finally:
        fetch_handle.close()
# %% Verify that it's working
myQuery = "Cancer" + "[tiab]"  # query in title and abstract
maxPapers = 1000  # TODO: think about moving parameters to a separate section
records = getPapers(myQuery, maxPapers)

# %% Now extract abstracts from records
# MEDLINE records are dict-like; the abstract is stored under the "AB" key and
# is absent for papers without an abstract, so those records are skipped.
r_abstracts = [r["AB"] for r in records if "AB" in r]

# %% Now import torch modules needed to load the data
try:
    import torch
    from torch.utils.data import Dataset
except ImportError:
    # Only a missing package should trigger an install; any other error must surface.
    import importlib
    import subprocess
    import sys

    # The PyPI distribution is named `torch`; `pytorch` is a placeholder that fails to install.
    # Installing via `sys.executable -m pip` targets the interpreter running this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
    importlib.invalidate_caches()  # make the freshly installed package importable
    # Retry the imports so `torch` and `Dataset` are defined after the install.
    import torch
    from torch.utils.data import Dataset